├── README.md ├── licenses ├── lucene-LICENSE.txt └── lucene-NOTICE.txt ├── pom.xml └── src └── main ├── assemblies └── plugin.xml ├── java └── org │ ├── analyzer │ └── lucene │ │ ├── SDAnalyzer.java │ │ └── SDTokenizer.java │ └── elasticsearch │ ├── index │ └── analysis │ │ ├── SDAnalyzerProvider.java │ │ └── SDTokenizerFactory.java │ └── plugin │ └── analysis │ └── AnalysisSDPlugin.java └── resources ├── plugin-descriptor.properties └── plugin-security.policy /README.md: -------------------------------------------------------------------------------- 1 | # Stanford Core NLP Analyzer for Elasticsearch 2 | 3 | This project builds an Elasticsearch plugin that integrates the Stanford NLP analyzer. 4 | 5 | ## Stanford NLP? 6 | 7 | The Stanford NLP project is an open-source project (licensed under the GNU General Public License v3 or later)\ 8 | that provides a set of human language technology tools.\ 9 | As its name suggests, the project is maintained by the Stanford NLP Group.\ 10 | ref:\ 11 | [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html)\ 12 | [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP) 13 | 14 | ## Why Stanford Core NLP? 15 | 16 | Yes, there are several open-source analyzers for ES, including the popular IK and Jieba analyzers and 17 | some other NLP analyzers provided by various groups and companies. 18 | And yes, each of them has its advantages. 19 | 20 | But after comparing a batch of test cases across all these analyzers, both open-source and commercial, 21 | we found that Stanford NLP seems to be the best fit for our project, as we need not only sentence segmentation 22 | but also sentiment analysis. 23 | 24 | ## Why this project? 
25 | 26 | I searched both Google and GitHub and asked for help on a professional forum 27 | ([elasticsearch China](https://elasticsearch.cn/)), 28 | and found that there does not seem to be a stable project that meets this requirement. 29 | 30 | Well, there was only one choice left for me: build a suitable plugin to get it done. :) 31 | 32 | ## Use? 33 | 34 | ### Install by git 35 | 1. I assume your machine already has Java JDK 8, Maven, Git, etc. installed 36 | 2. git clone this project 37 | 3. package this project with Maven: `mvn clean install -e -U` 38 | 4. copy the jar packages into the plugin folder: 39 | 1. the project jar of course 40 | 1. stanford-corenlp-3.9.2.jar 41 | 1. stanford-chinese-corenlp-models-current.jar 42 | 1. commons-logging-1.2.jar 43 | 5. restart ES and enjoy 44 | 45 | ### Install by archive file 46 | 1. download the release file 47 | 2. untar it into the ES plugin folder 48 | 3. cp the stanford-*.jar files into this folder 49 | 4. also, download stanford-chinese-corenlp-models-current.jar into this folder, as it is too large to include in the release 50 | 5. restart ES and enjoy 51 | 52 | ## Quick Example 53 | 1. Create an index 54 | ```bash 55 | curl -XPUT http://localhost:9200/index -H 'Content-Type:application/json' -d' 56 | { 57 | "settings": { 58 | "number_of_replicas": 0, 59 | "number_of_shards": 1 60 | }, 61 | "mappings": { 62 | "_doc": { 63 | "properties": { 64 | "id": { 65 | "type": "integer" 66 | }, 67 | "text": { 68 | "type": "text", 69 | "analyzer": "stanford-core-nlp", 70 | "search_analyzer": "stanford-core-nlp" 71 | } 72 | } 73 | } 74 | } 75 | } 76 | ' 77 | ``` 78 | 79 | 2. 
Index some docs 80 | ```bash 81 | curl -XPOST http://localhost:9200/index/_doc/1 -H 'Content-Type:application/json' -d' 82 | {"id":1, "text":"中美贸易摩擦到贸易战"} 83 | ' 84 | ``` 85 | 86 | ```bash 87 | curl -XPOST http://localhost:9200/index/_doc/2 -H 'Content-Type:application/json' -d' 88 | {"id":2, "text":"美国和墨西哥重新签订美墨贸易协定"} 89 | ' 90 | ``` 91 | 92 | ```bash 93 | curl -XPOST http://localhost:9200/index/_doc/3 -H 'Content-Type:application/json' -d' 94 | {"id":3, "text":"知乎裁员意味着互联网寒冬的到来"} 95 | ' 96 | ``` 97 | 98 | ```bash 99 | curl -XPOST http://localhost:9200/index/_doc/4 -H 'Content-Type:application/json' -d' 100 | {"id":4, "text":"比亚迪公开被撕拖欠款项,声称欠款方伪造公章"} 101 | ' 102 | ``` 103 | 104 | 3. Query with highlighting 105 | 106 | ```bash 107 | curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d' 108 | { 109 | "query" : { "match" : { "text" : "美国" }}, 110 | "highlight" : { 111 | "pre_tags" : ["", ""], 112 | "post_tags" : ["", ""], 113 | "fields" : { 114 | "text" : {} 115 | } 116 | } 117 | } 118 | ' 119 | ``` 120 | 121 | Result 122 | ```json 123 | { 124 | "took": 192, 125 | "timed_out": false, 126 | "_shards": { 127 | "total": 1, 128 | "successful": 1, 129 | "skipped": 0, 130 | "failed": 0 131 | }, 132 | "hits": { 133 | "total": 2, 134 | "max_score": 0.92510056, 135 | "hits": [ 136 | { 137 | "_index": "index", 138 | "_type": "_doc", 139 | "_id": "1", 140 | "_score": 0.92510056, 141 | "_source": { 142 | "id": 1, 143 | "text": "美国政府瘫痪" 144 | }, 145 | "highlight": { 146 | "text": [ 147 | "美国政府瘫痪" 148 | ] 149 | } 150 | }, 151 | { 152 | "_index": "index", 153 | "_type": "_doc", 154 | "_id": "2", 155 | "_score": 0.65024257, 156 | "_source": { 157 | "id": 2, 158 | "text": "美国和墨西哥重新签订美墨贸易协定" 159 | }, 160 | "highlight": { 161 | "text": [ 162 | "美国和墨西哥重新签订美墨贸易协定" 163 | ] 164 | } 165 | } 166 | ] 167 | } 168 | } 169 | ``` 170 | 171 | ## Shit happens? 172 | 1. xx access denied (e.g. 
java.lang.RuntimePermission xxxx)?\ 173 | -> Add these into plugin-security.policy 174 | * permission java.lang.RuntimePermission "*"; 175 | * permission java.lang.reflect.ReflectPermission "*"; 176 | -> Edit the `jvm.options` 177 | * add the following option 178 | -Djava.security.policy=file://${dir of this}/plugin-security.policy 179 | 180 | 2. xx gc xx overhead?\ 181 | -> Edit the `jvm.options` 182 | * increase the -Xms and -Xmx heap sizes 183 | 184 | 185 | -------------------------------------------------------------------------------- /licenses/lucene-LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | 206 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was 207 | derived from unicode conversion examples available at 208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright 209 | from those sources: 210 | 211 | /* 212 | * Copyright 2001-2004 Unicode, Inc. 213 | * 214 | * Disclaimer 215 | * 216 | * This source code is provided as is by Unicode, Inc. No claims are 217 | * made as to fitness for any particular purpose. No warranties of any 218 | * kind are expressed or implied. The recipient agrees to determine 219 | * applicability of information provided. If this file has been 220 | * purchased on magnetic or optical media from Unicode, Inc., the 221 | * sole remedy for any claim will be exchange of defective media 222 | * within 90 days of receipt. 223 | * 224 | * Limitations on Rights to Redistribute This Code 225 | * 226 | * Unicode, Inc. 
hereby grants the right to freely use the information 227 | * supplied in this file in the creation of products supporting the 228 | * Unicode Standard, and to make copies of this file in any form 229 | * for internal or external distribution as long as this notice 230 | * remains attached. 231 | */ 232 | 233 | 234 | Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was 235 | derived from Python 2.4.2 sources available at 236 | http://www.python.org. Full license is here: 237 | 238 | http://www.python.org/download/releases/2.4.2/license/ 239 | 240 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was 241 | derived from Python 3.1.2 sources available at 242 | http://www.python.org. Full license is here: 243 | 244 | http://www.python.org/download/releases/3.1.2/license/ 245 | 246 | Some code in core/src/java/org/apache/lucene/util/automaton was 247 | derived from Brics automaton sources available at 248 | www.brics.dk/automaton/. Here is the copyright from those sources: 249 | 250 | /* 251 | * Copyright (c) 2001-2009 Anders Moeller 252 | * All rights reserved. 253 | * 254 | * Redistribution and use in source and binary forms, with or without 255 | * modification, are permitted provided that the following conditions 256 | * are met: 257 | * 1. Redistributions of source code must retain the above copyright 258 | * notice, this list of conditions and the following disclaimer. 259 | * 2. Redistributions in binary form must reproduce the above copyright 260 | * notice, this list of conditions and the following disclaimer in the 261 | * documentation and/or other materials provided with the distribution. 262 | * 3. The name of the author may not be used to endorse or promote products 263 | * derived from this software without specific prior written permission. 
264 | * 265 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 266 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 267 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 268 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 269 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 270 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 271 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 272 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 273 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 274 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 275 | */ 276 | 277 | The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton 278 | were automatically generated with the moman/finenight FSA package. 279 | Here is the copyright for those sources: 280 | 281 | # Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, 282 | # 283 | # Permission is hereby granted, free of charge, to any person 284 | # obtaining a copy of this software and associated documentation 285 | # files (the "Software"), to deal in the Software without 286 | # restriction, including without limitation the rights to use, 287 | # copy, modify, merge, publish, distribute, sublicense, and/or sell 288 | # copies of the Software, and to permit persons to whom the 289 | # Software is furnished to do so, subject to the following 290 | # conditions: 291 | # 292 | # The above copyright notice and this permission notice shall be 293 | # included in all copies or substantial portions of the Software. 294 | # 295 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 296 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 297 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 298 | # NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 299 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 300 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 301 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 302 | # OTHER DEALINGS IN THE SOFTWARE. 303 | 304 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was 305 | derived from ICU (http://www.icu-project.org) 306 | The full license is available here: 307 | http://source.icu-project.org/repos/icu/icu/trunk/license.html 308 | 309 | /* 310 | * Copyright (C) 1999-2010, International Business Machines 311 | * Corporation and others. All Rights Reserved. 312 | * 313 | * Permission is hereby granted, free of charge, to any person obtaining a copy 314 | * of this software and associated documentation files (the "Software"), to deal 315 | * in the Software without restriction, including without limitation the rights 316 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the 317 | * Software, and to permit persons to whom the Software is furnished to do so, 318 | * provided that the above copyright notice(s) and this permission notice appear 319 | * in all copies of the Software and that both the above copyright notice(s) and 320 | * this permission notice appear in supporting documentation. 321 | * 322 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 323 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 324 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
325 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE 326 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR 327 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 328 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 329 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 330 | * 331 | * Except as contained in this notice, the name of a copyright holder shall not 332 | * be used in advertising or otherwise to promote the sale, use or other 333 | * dealings in this Software without prior written authorization of the 334 | * copyright holder. 335 | */ 336 | 337 | The following license applies to the Snowball stemmers: 338 | 339 | Copyright (c) 2001, Dr Martin Porter 340 | Copyright (c) 2002, Richard Boulton 341 | All rights reserved. 342 | 343 | Redistribution and use in source and binary forms, with or without 344 | modification, are permitted provided that the following conditions are met: 345 | 346 | * Redistributions of source code must retain the above copyright notice, 347 | * this list of conditions and the following disclaimer. 348 | * Redistributions in binary form must reproduce the above copyright 349 | * notice, this list of conditions and the following disclaimer in the 350 | * documentation and/or other materials provided with the distribution. 351 | * Neither the name of the copyright holders nor the names of its contributors 352 | * may be used to endorse or promote products derived from this software 353 | * without specific prior written permission. 354 | 355 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 356 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 357 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 358 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 359 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 360 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 361 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 362 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 363 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 364 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 365 | 366 | The following license applies to the KStemmer: 367 | 368 | Copyright © 2003, 369 | Center for Intelligent Information Retrieval, 370 | University of Massachusetts, Amherst. 371 | All rights reserved. 372 | 373 | Redistribution and use in source and binary forms, with or without modification, 374 | are permitted provided that the following conditions are met: 375 | 376 | 1. Redistributions of source code must retain the above copyright notice, this 377 | list of conditions and the following disclaimer. 378 | 379 | 2. Redistributions in binary form must reproduce the above copyright notice, 380 | this list of conditions and the following disclaimer in the documentation 381 | and/or other materials provided with the distribution. 382 | 383 | 3. The names "Center for Intelligent Information Retrieval" and 384 | "University of Massachusetts" must not be used to endorse or promote products 385 | derived from this software without prior written permission. To obtain 386 | permission, contact info@ciir.cs.umass.edu. 387 | 388 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS 389 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 390 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 391 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE 392 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 393 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 394 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 395 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 396 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 397 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 398 | SUCH DAMAGE. 399 | 400 | The following license applies to the Morfologik project: 401 | 402 | Copyright (c) 2006 Dawid Weiss 403 | Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski 404 | All rights reserved. 405 | 406 | Redistribution and use in source and binary forms, with or without modification, 407 | are permitted provided that the following conditions are met: 408 | 409 | * Redistributions of source code must retain the above copyright notice, 410 | this list of conditions and the following disclaimer. 411 | 412 | * Redistributions in binary form must reproduce the above copyright notice, 413 | this list of conditions and the following disclaimer in the documentation 414 | and/or other materials provided with the distribution. 415 | 416 | * Neither the name of Morfologik nor the names of its contributors 417 | may be used to endorse or promote products derived from this software 418 | without specific prior written permission. 419 | 420 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 421 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 422 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 423 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 424 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 425 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 426 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 427 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 428 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 429 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 430 | 431 | --- 432 | 433 | The dictionary comes from Morfologik project. Morfologik uses data from 434 | Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and 435 | is licenced on the terms of (inter alia) LGPL and Creative Commons 436 | ShareAlike. The part-of-speech tags were added in Morfologik project and 437 | are not found in the data from sjp.pl. The tagset is similar to IPI PAN 438 | tagset. 439 | 440 | --- 441 | 442 | The following license applies to the Morfeusz project, 443 | used by org.apache.lucene.analysis.morfologik. 444 | 445 | BSD-licensed dictionary of Polish (SGJP) 446 | http://sgjp.pl/morfeusz/ 447 | 448 | Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 449 | Marcin Woliński, Robert Wołosz 450 | 451 | All rights reserved. 452 | 453 | Redistribution and use in source and binary forms, with or without 454 | modification, are permitted provided that the following conditions are 455 | met: 456 | 457 | 1. Redistributions of source code must retain the above copyright 458 | notice, this list of conditions and the following disclaimer. 459 | 460 | 2. Redistributions in binary form must reproduce the above copyright 461 | notice, this list of conditions and the following disclaimer in the 462 | documentation and/or other materials provided with the 463 | distribution. 
464 | 465 | THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS 466 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 467 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 468 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE 469 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 470 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 471 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 472 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 473 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 474 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 475 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 476 | -------------------------------------------------------------------------------- /licenses/lucene-NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Lucene 2 | Copyright 2014 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 6 | 7 | Includes software from other Apache Software Foundation projects, 8 | including, but not limited to: 9 | - Apache Ant 10 | - Apache Jakarta Regexp 11 | - Apache Commons 12 | - Apache Xerces 13 | 14 | ICU4J, (under analysis/icu) is licensed under an MIT styles license 15 | and Copyright (c) 1995-2008 International Business Machines Corporation and others 16 | 17 | Some data files (under analysis/icu/src/data) are derived from Unicode data such 18 | as the Unicode Character Database. See http://unicode.org/copyright.html for more 19 | details. 20 | 21 | Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is 22 | BSD-licensed, created by Anders Møller. 
See http://www.brics.dk/automaton/ 23 | 24 | The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were 25 | automatically generated with the moman/finenight FSA library, created by 26 | Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, 27 | see http://sites.google.com/site/rrettesite/moman and 28 | http://bitbucket.org/jpbarrette/moman/overview/ 29 | 30 | The class org.apache.lucene.util.WeakIdentityMap was derived from 31 | the Apache CXF project and is Apache License 2.0. 32 | 33 | The Google Code Prettify is Apache License 2.0. 34 | See http://code.google.com/p/google-code-prettify/ 35 | 36 | JUnit (junit-4.10) is licensed under the Common Public License v. 1.0 37 | See http://junit.sourceforge.net/cpl-v10.html 38 | 39 | This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin 40 | g Package (jaspell): http://jaspell.sourceforge.net/ 41 | License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) 42 | 43 | The snowball stemmers in 44 | analysis/common/src/java/net/sf/snowball 45 | were developed by Martin Porter and Richard Boulton. 46 | The snowball stopword lists in 47 | analysis/common/src/resources/org/apache/lucene/analysis/snowball 48 | were developed by Martin Porter and Richard Boulton. 49 | The full snowball package is available from 50 | http://snowball.tartarus.org/ 51 | 52 | The KStem stemmer in 53 | analysis/common/src/org/apache/lucene/analysis/en 54 | was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) 55 | under the BSD-license. 56 | 57 | The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default 58 | stopword list that is BSD-licensed created by Jacques Savoy. 
These files reside in: 59 | analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, 60 | analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, 61 | analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, 62 | analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, 63 | analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt 64 | See http://members.unine.ch/jacques.savoy/clef/index.html. 65 | 66 | The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers 67 | (common) are based on BSD-licensed reference implementations created by Jacques Savoy and 68 | Ljiljana Dolamic. These files reside in: 69 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java 70 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java 71 | analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java 72 | analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java 73 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java 74 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java 75 | analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java 76 | analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java 77 | analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java 78 | analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java 79 | analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java 80 | 81 | The Stempel analyzer (stempel) includes BSD-licensed software developed 82 | by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, 83 | and Edmond Nolan. 
84 | 85 | The Polish analyzer (stempel) comes with a default 86 | stopword list that is BSD-licensed created by the Carrot2 project. The file resides 87 | in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. 88 | See http://project.carrot2.org/license.html. 89 | 90 | The SmartChineseAnalyzer source code (smartcn) was 91 | provided by Xiaoping Gao and copyright 2009 by www.imdict.net. 92 | 93 | WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) 94 | is derived from Unicode data such as the Unicode Character Database. 95 | See http://unicode.org/copyright.html for more details. 96 | 97 | The Morfologik analyzer (morfologik) includes BSD-licensed software 98 | developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). 99 | 100 | Morfologik uses data from Polish ispell/myspell dictionary 101 | (http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) 102 | LGPL and Creative Commons ShareAlike. 103 | 104 | Morfologic includes data from BSD-licensed dictionary of Polish (SGJP) 105 | (http://sgjp.pl/morfeusz/) 106 | 107 | Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original 108 | source code for this can be found at http://www.eclipse.org/jetty/downloads.php 109 | 110 | =========================================================================== 111 | Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration 112 | =========================================================================== 113 | 114 | This software includes a binary and/or source version of data from 115 | 116 | mecab-ipadic-2.7.0-20070801 117 | 118 | which can be obtained from 119 | 120 | http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz 121 | 122 | or 123 | 124 | http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz 125 | 126 | =========================================================================== 127 | 
mecab-ipadic-2.7.0-20070801 Notice 128 | =========================================================================== 129 | 130 | Nara Institute of Science and Technology (NAIST), 131 | the copyright holders, disclaims all warranties with regard to this 132 | software, including all implied warranties of merchantability and 133 | fitness, in no event shall NAIST be liable for 134 | any special, indirect or consequential damages or any damages 135 | whatsoever resulting from loss of use, data or profits, whether in an 136 | action of contract, negligence or other tortuous action, arising out 137 | of or in connection with the use or performance of this software. 138 | 139 | A large portion of the dictionary entries 140 | originate from ICOT Free Software. The following conditions for ICOT 141 | Free Software applies to the current dictionary as well. 142 | 143 | Each User may also freely distribute the Program, whether in its 144 | original form or modified, to any third party or parties, PROVIDED 145 | that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear 146 | on, or be attached to, the Program, which is distributed substantially 147 | in the same form as set out herein and that such intended 148 | distribution, if actually made, will neither violate or otherwise 149 | contravene any of the laws and regulations of the countries having 150 | jurisdiction over the User or the intended distribution itself. 151 | 152 | NO WARRANTY 153 | 154 | The program was produced on an experimental basis in the course of the 155 | research and development conducted during the project and is provided 156 | to users as so produced on an experimental basis. Accordingly, the 157 | program is provided without any warranty whatsoever, whether express, 158 | implied, statutory or otherwise. 
The term "warranty" used herein 159 | includes, but is not limited to, any warranty of the quality, 160 | performance, merchantability and fitness for a particular purpose of 161 | the program and the nonexistence of any infringement or violation of 162 | any right of any third party. 163 | 164 | Each user of the program will agree and understand, and be deemed to 165 | have agreed and understood, that there is no warranty whatsoever for 166 | the program and, accordingly, the entire risk arising from or 167 | otherwise connected with the program is assumed by the user. 168 | 169 | Therefore, neither ICOT, the copyright holder, or any other 170 | organization that participated in or was otherwise related to the 171 | development of the program and their respective officials, directors, 172 | officers and other employees shall be held liable for any and all 173 | damages, including, without limitation, general, special, incidental 174 | and consequential damages, arising out of or otherwise in connection 175 | with the use or inability to use the program or any product, material 176 | or result produced or otherwise obtained by using the program, 177 | regardless of whether they have been advised of, or otherwise had 178 | knowledge of, the possibility of such damages at any time during the 179 | project or thereafter. Each user will be deemed to have agreed to the 180 | foregoing by his or her commencement of use of the program. The term 181 | "use" as used herein includes, but is not limited to, the use, 182 | modification, copying and distribution of the program and the 183 | production of secondary products from the program. 
184 | 185 | In the case where the program, whether in its original form or 186 | modified, was distributed or delivered to or received by a user from 187 | any person, organization or entity other than ICOT, unless it makes or 188 | grants independently of ICOT any specific warranty to the user in 189 | writing, such person, organization or entity, will also be exempted 190 | from and not be held liable to the user for any such damages as noted 191 | above as far as the program is concerned. 192 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | godLockin 7 | esStanfordNLPAnalyzer 8 | ${elasticsearch.version} 9 | jar 10 | Stanford Core NLP Analyzer for Elasticsearch 11 | 2018 12 | 13 | 14 | 15 | The Apache Software License, Version 2.0 16 | http://www.apache.org/licenses/LICENSE-2.0.txt 17 | repo 18 | 19 | 20 | 21 | 22 | 23 | Steven 24 | stevenchenworking@gmail.com 25 | godLockin 26 | 27 | 28 | 29 | 30 | scm:git:git@github.com:godlockin/esStanfordNLPAnalyzer.git 31 | scm:git:git@github.com:godlockin/esStanfordNLPAnalyzer.git 32 | https://github.com/godlockin/esStanfordNLPAnalyzer 33 | 34 | 35 | 36 | 1.8 37 | 1.8 38 | 6.5.0 39 | ${project.basedir}/src/main/assemblies/plugin.xml 40 | analysis-stanford-nlp 41 | org.elasticsearch.plugin.analysis.AnalysisSDPlugin 42 | true 43 | 44 | 3.9.2 45 | 3.5.1 46 | 2.8 47 | 48 | 49 | 50 | 51 | org.elasticsearch 52 | elasticsearch 53 | ${elasticsearch.version} 54 | compile 55 | 56 | 57 | 58 | edu.stanford.nlp 59 | stanford-corenlp 60 | ${stanfordNLP.version} 61 | 62 | 63 | edu.stanford.nlp 64 | stanford-corenlp 65 | ${stanfordNLP.version} 66 | models 67 | 68 | 69 | edu.stanford.nlp 70 | stanford-corenlp 71 | ${stanfordNLP.version} 72 | models-chinese 73 | 74 | 75 | 76 | 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-compiler-plugin 81 | ${org.apache.maven.compiler.plugin.version} 82 | 
83 | ${maven.compiler.target} 84 | ${maven.compiler.target} 85 | 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-dependency-plugin 90 | ${org.apache.maven.dependency.plugin.version} 91 | 92 | 93 | copy 94 | package 95 | 96 | copy 97 | 98 | 99 | 100 | 101 | edu.stanford.nlp 102 | stanford-corenlp 103 | ${stanfordNLP.version} 104 | models 105 | jar 106 | true 107 | ${basedir}/target 108 | stanford-corenlp-models-${stanfordNLP.version}.jar 109 | 110 | 111 | false 112 | true 113 | 114 | 115 | 116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | analysis-stanfordNLP 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/config 11 | config 12 | 13 | 14 | 15 | 16 | 17 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 18 | 19 | true 20 | 21 | 22 | ${project.basedir}/src/main/resources/plugin-security.policy 23 | 24 | true 25 | 26 | 27 | 28 | 29 | 30 | true 31 | true 32 | 33 | org.elasticsearch:elasticsearch 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /src/main/java/org/analyzer/lucene/SDAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Stanford Core NLP 中文分词器 版本 1.0 3 | * Stanford Core NLP Analyzer Release 1.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供 21 | * provided by Steven Chen 22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html) 23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP) 24 | */ 25 | package org.analyzer.lucene; 26 | 27 | import org.apache.lucene.analysis.Analyzer; 28 | 29 | public final class SDAnalyzer extends Analyzer { 30 | 31 | public SDAnalyzer(){ } 32 | 33 | @Override 34 | protected TokenStreamComponents createComponents(String fieldName) { 35 | return new TokenStreamComponents(new SDTokenizer()); 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/analyzer/lucene/SDTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Stanford Core NLP 中文分词器 版本 1.0 3 | * Stanford Core NLP Analyzer Release 1.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供 21 | * provided by Steven Chen 22 | * Ref: Stanford Core NLP project 23 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html) 24 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP) 25 | */ 26 | package org.analyzer.lucene; 27 | 28 | import edu.stanford.nlp.ling.CoreAnnotations; 29 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 30 | import org.apache.logging.log4j.LogManager; 31 | import org.apache.logging.log4j.Logger; 32 | import org.apache.lucene.analysis.Tokenizer; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 36 | import org.elasticsearch.SpecialPermission; 37 | 38 | import java.io.BufferedReader; 39 | import java.io.IOException; 40 | import java.security.AccessController; 41 | import java.security.PrivilegedAction; 42 | import java.util.*; 43 | import java.util.stream.Collectors; 44 | import java.util.stream.Stream; 45 | 46 | public final class SDTokenizer extends Tokenizer { 47 | 48 | private final CharTermAttribute termAtt; 49 | private final OffsetAttribute offsetAtt; 50 | private int endPosition; 51 | private Iterator wordsIter = Collections.emptyIterator(); 52 | 53 | private Set ignoreSymbols = new HashSet(Stream.of(("囧 ⊙ ● ○ ⊕ ◎ Θ ⊙ ¤ ㈱ ㊣ ★ ☆ ♀ ◆ ◇ ◣ ◢ ◥ ▲ ▼ △ ▽ ⊿ ◤ ◥ ▂ ▃ " + 54 | "▄ ▅ ▆ ▇ █ █ ■ ▓ □ 〓 ≡ ╝ ╚ ╔ ╗ ╬ ═ ╓ ╩ ┠ ┨ ┯ ┷ ┏ ┓ 
┗ ┛ ┳ ⊥ 『 』 ┌ ┐ └ ┘ ∟ 「 」 ↑ ↓ → ← ↘ ↙ ♀ ♂ ┇ " + 55 | "┅ ﹉ ﹊ ﹍ ﹎ ╭ ╮ ╰ ╯ *^_^* ^*^ ^-^ ^_^ ^(^ ∵ ∴ ‖ | | ︴ ﹏ ﹋ ﹌ ( ) 〔 〕 【 】 〖 〗 @ : ! / \\ \" " + 56 | "_ < > ` , · 。 ≈ { } ~ ~ ( ) _ -『 』 √ $ @ * & # ※ 卐 々 ∞ Ψ ∪ ∩ ∈ ∏ の ℡ ぁ § ∮ ” 〃 ミ 灬 ξ № ∑ ⌒ ξ ζ ω * " + 57 | "\uE7E7 \uE7F3 ㄨ ≮ ≯ + - × ÷ + - ± / = ∫ ∮ ∝ ∞ ∧ ∨ ∑ ∏ ‖ ∠ ≌ ∽ ≤ ≥ ≈ < > じ ☆ ↑ ↓ ⊙ ● ★ ☆ ■ ♀ 『 』 Ψ" + 58 | " ※ → № ← ㊣ ∑ ⌒ 〖 〗 @ ξ ζ ω □ ∮ 〓 ※ ∴ ぷ ∏ 卐 【 】 △ √ ∩ ¤ 々 ♀ ♂ ∞ ① ㄨ ≡ ↘ ↙ ┗ ┛ ╰ ☆ ╮ ① ② ③ ④ ⑤ ⑥ ⑦ ⑧ ⑨ ⑩ " + 59 | "⑴ ⑵ ⑶ ⑷ ⑸ ⑹ ⑺ ⑻ ⑼ ⑽ ⑾ ⑿ ⒀ ⒁ ⒂ ⒃ ⒄ ⒅ ⒆ ⒇ 丨 丩 丬 丶 丷 丿 乀 乙 乂 乄 乆 乛 亅 亠 亻 冂 冫 冖 凵" + 60 | "\uE81C \uE81D \uE815 \uE816 \uE817 \uE818 \uE819 \uE81E \uE822 \uE823 \uE82B \uE82C\uE830 \uE831 \uE832 \uE833 \uE836 \uE838 \uE839 \uE83A \uE83B \uE83E \uE848 \uE81A \uE81B" + 61 | " 、 。 . ? ! ~ $ % @ & # * ? ; ∶ … ¨ , · ˙ ? ‘ ’ “ ” ” 〃 ‘ ′ 〃 ↑ ↓ ← → ↖ ↗ ↙ ↘ ㊣ ◎ ○ ● ⊕ ⊙ ○ ● △ ▲ ☆ ★ ◇ ◆ □ ■ ▽ ▼ § ¥ 〒" + 62 | " ¢ £ ※ ♀ ♂ α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ τ υ φ χ ψ ω C").split(" ")).filter(x -> !(null == x || "".equals(x.trim()))).collect(Collectors.toList())); 63 | private PositionIncrementAttribute posIncrAtt; 64 | 65 | private int increment = 0; 66 | private StanfordCoreNLP pipeline; 67 | 68 | public SDTokenizer(){ 69 | super(); 70 | 71 | offsetAtt = addAttribute(OffsetAttribute.class); 72 | termAtt = addAttribute(CharTermAttribute.class); 73 | posIncrAtt = addAttribute(PositionIncrementAttribute.class); 74 | } 75 | 76 | private StanfordCoreNLP instance() { 77 | if (null == pipeline) { 78 | synchronized (SDTokenizer.class) { 79 | if (null == pipeline) { 80 | Properties props = new Properties(); 81 | props.setProperty("annotators", "tokenize, ssplit"); 82 | props.setProperty("tokenize.language", "zh"); 83 | props.setProperty("segment.model", "edu/stanford/nlp/models/segmenter/chinese/ctb.gz"); 84 | props.setProperty("segment.sighanCorporaDict", "edu/stanford/nlp/models/segmenter/chinese"); 85 | props.setProperty("segment.serDictionary", "edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz"); 86 | 
props.setProperty("segment.sighanPostProcessing", "true"); 87 | props.setProperty("ssplit.boundaryTokenRegex", "[.。]|[!?!?]+"); 88 | 89 | SpecialPermission.check(); 90 | pipeline = AccessController.doPrivileged((PrivilegedAction) () -> new StanfordCoreNLP(props)); 91 | return pipeline; 92 | } 93 | } 94 | } 95 | return pipeline; 96 | } 97 | 98 | @Override 99 | public boolean incrementToken() { 100 | clearAttributes(); 101 | 102 | if(wordsIter.hasNext()){ 103 | String word = wordsIter.next(); 104 | int wordLength = word.length(); 105 | posIncrAtt.setPositionIncrement(increment + 1); 106 | termAtt.append(word); 107 | termAtt.setLength(wordLength); 108 | offsetAtt.setOffset(endPosition + 1, endPosition + 1 + wordLength); 109 | endPosition += wordLength; 110 | return true; 111 | } 112 | return false; 113 | } 114 | 115 | @Override 116 | public void reset() throws IOException { 117 | super.reset(); 118 | // reset the input content 119 | endPosition = -1; 120 | increment = 0; 121 | 122 | List words = new ArrayList<>(); 123 | try (BufferedReader br = new BufferedReader(input)) { 124 | String temp; 125 | StringBuilder stringBuilder = new StringBuilder(); 126 | while ((temp = br.readLine()) != null) { 127 | stringBuilder.append(temp.trim()); 128 | } 129 | 130 | words = instance().process(stringBuilder.toString().trim()) 131 | .get(CoreAnnotations.TokensAnnotation.class) 132 | .stream().map(x -> x.get(CoreAnnotations.TextAnnotation.class)) 133 | .filter(x -> !ignoreSymbols.contains(x)) 134 | .collect(Collectors.toList()); 135 | } catch (IOException e) { 136 | e.printStackTrace(); 137 | } finally { 138 | wordsIter = words.iterator(); 139 | } 140 | } 141 | 142 | @Override 143 | public final void end() throws IOException { 144 | super.end(); 145 | 146 | // set final offset 147 | int finalOffset = correctOffset(this.endPosition); 148 | offsetAtt.setOffset(finalOffset, finalOffset); 149 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + increment); 150 | } 151 | 
} 152 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/SDAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Stanford Core NLP 中文分词器 版本 1.0 3 | * Stanford Core NLP Analyzer Release 1.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供 21 | * provided by Steven Chen 22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html) 23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP) 24 | */ 25 | package org.elasticsearch.index.analysis; 26 | 27 | import org.elasticsearch.common.settings.Settings; 28 | import org.elasticsearch.env.Environment; 29 | import org.elasticsearch.index.IndexSettings; 30 | import org.analyzer.lucene.SDAnalyzer; 31 | 32 | public class SDAnalyzerProvider extends AbstractIndexAnalyzerProvider { 33 | 34 | public SDAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 35 | super(indexSettings, name, settings); 36 | } 37 | 38 | @Override 39 | public SDAnalyzer get() { 40 | return new SDAnalyzer(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/SDTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Stanford Core NLP 中文分词器 版本 1.0 3 | * Stanford Core NLP Analyzer Release 1.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供 21 | * provided by Steven Chen 22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html) 23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP) 24 | */ 25 | package org.elasticsearch.index.analysis; 26 | 27 | import org.apache.lucene.analysis.Tokenizer; 28 | import org.elasticsearch.common.settings.Settings; 29 | import org.elasticsearch.env.Environment; 30 | import org.elasticsearch.index.IndexSettings; 31 | import org.analyzer.lucene.SDTokenizer; 32 | 33 | public class SDTokenizerFactory extends AbstractTokenizerFactory { 34 | 35 | public SDTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 36 | super(indexSettings, name, settings); 37 | } 38 | 39 | @Override 40 | public Tokenizer create() { 41 | return new SDTokenizer(); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/AnalysisSDPlugin.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Stanford Core NLP 中文分词器 版本 1.0 3 | * Stanford Core NLP Analyzer Release 1.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由陈晨(stevenchenworking@gmail.com)提供 21 | * provided by Steven Chen 22 | * [Stanford CoreNLP home page](https://stanfordnlp.github.io/CoreNLP/index.html) 23 | * [Stanford CoreNLP GitHub page](https://github.com/stanfordnlp/CoreNLP) 24 | */ 25 | package org.elasticsearch.plugin.analysis; 26 | 27 | import org.apache.lucene.analysis.Analyzer; 28 | import org.elasticsearch.index.analysis.AnalyzerProvider; 29 | import org.elasticsearch.index.analysis.SDAnalyzerProvider; 30 | import org.elasticsearch.index.analysis.SDTokenizerFactory; 31 | import org.elasticsearch.index.analysis.TokenizerFactory; 32 | import org.elasticsearch.indices.analysis.AnalysisModule; 33 | import org.elasticsearch.plugins.AnalysisPlugin; 34 | import org.elasticsearch.plugins.Plugin; 35 | 36 | import java.util.HashMap; 37 | import java.util.Map; 38 | 39 | public class AnalysisSDPlugin extends Plugin implements AnalysisPlugin { 40 | 41 | private static final String PLUGIN_NAME = "stanford-core-nlp"; 42 | 43 | @Override 44 | public Map> getTokenizers() { 45 | Map> extra = new HashMap<>(); 46 | extra.put(PLUGIN_NAME, SDTokenizerFactory::new); 47 | return extra; 48 | } 49 | 50 | @Override 51 | public Map>> getAnalyzers() { 52 | Map>> extra = new HashMap<>(); 53 | extra.put(PLUGIN_NAME, SDAnalyzerProvider::new); 54 | return extra; 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: 
-------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=6.5.4 38 | #${project.version} 39 | # 40 | # 'name': the plugin name 41 | name=stanford-core-nlp 42 | #${elasticsearch.plugin.name} 43 | # 44 | # 'classname': the name of the class to load, fully-qualified. 
45 | classname=org.elasticsearch.plugin.analysis.AnalysisSDPlugin 46 | #${elasticsearch.plugin.classname} 47 | # 48 | # 'java.version' version of java the code is built against 49 | # use the system property java.specification.version 50 | # version string must be a sequence of nonnegative decimal integers 51 | # separated by "."'s and may have leading zeros 52 | java.version=1.8 53 | #${maven.compiler.target} 54 | # 55 | # 'elasticsearch.version' version of elasticsearch compiled against 56 | # You will have to release a new version of the plugin for each new 57 | # elasticsearch release. This version is checked when the plugin 58 | # is loaded so Elasticsearch will refuse to start in the presence of 59 | # plugins with the incorrect elasticsearch.version. 60 | elasticsearch.version=6.5.4 61 | #${elasticsearch.version} 62 | -------------------------------------------------------------------------------- /src/main/resources/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | // needed because of the hot reload functionality 3 | permission java.lang.RuntimePermission "*"; 4 | permission java.lang.reflect.ReflectPermission "*"; 5 | }; --------------------------------------------------------------------------------