├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── build.gradle ├── es-ik-sqlite3 ├── build.gradle ├── libs │ └── sqlite-jdbc-3.8.10.1.jar └── src │ ├── main │ ├── java │ │ └── io │ │ │ └── github │ │ │ └── zacker330 │ │ │ └── es │ │ │ └── ik │ │ │ └── es │ │ │ └── ik │ │ │ └── analyzer │ │ │ └── Sqlite3Configuration.java │ └── resources │ │ └── META-INF │ │ └── services │ │ └── org.elasticsearch.index.analysis.ik.spi.Configuration │ └── test │ ├── java │ ├── io │ │ └── github │ │ │ └── zacker330 │ │ │ └── es │ │ │ └── ik │ │ │ ├── AbstractIntegrationTest.java │ │ │ └── DictionaryDatasource.java │ └── org │ │ └── wltea │ │ └── analyzer │ │ ├── IKAnalzyerTest.java │ │ └── LuceneIndexAndSearchTest.java │ └── resources │ ├── database.sql │ ├── logback-test.xml │ ├── mainDic.properties │ ├── quantifierDic.properties │ └── stopwordDic.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── ik-analysis-core ├── build.gradle ├── config │ └── checkstyle │ │ └── checkstyle.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── wltea │ │ └── analyzer │ │ ├── configuration │ │ └── DictionaryConfiguration.java │ │ ├── core │ │ ├── AnalyzeContext.java │ │ ├── CJKSegmenter.java │ │ ├── CN_QuantifierSegmenter.java │ │ ├── CharacterUtil.java │ │ ├── IKArbitrator.java │ │ ├── IKSegmenter.java │ │ ├── ISegmenter.java │ │ ├── LetterSegmenter.java │ │ ├── Lexeme.java │ │ ├── LexemePath.java │ │ └── QuickSortSet.java │ │ └── dic │ │ ├── DictSegment.java │ │ ├── Dictionary.java │ │ └── Hit.java │ └── test │ └── java │ └── org │ └── wltea │ └── analyzer │ ├── IKSegmenterTest.java │ └── MockDictionary.java ├── ik-analysis-es-plugin ├── .gitignore ├── build.gradle └── src │ ├── main │ ├── java │ │ └── org │ │ │ ├── elasticsearch │ │ │ ├── index │ │ │ │ └── analysis │ │ │ │ │ └── ik │ │ │ │ │ ├── IKAnalysisBinderProcessor.java │ │ │ │ │ ├── IKAnalyzerProvider.java │ │ │ │ │ ├── IKTokenizerFactory.java │ │ │ │ │ ├── NotFoundIKAnalyzerConfigurationImplementation.java │ │ │ │ │ └── spi │ │ │ │ │ └── Configuration.java │ │ │ └── plugin │ │ │ │ └── analyzer │ │ │ │ └── ik │ │ │ │ └── AnalysisIKPlugin.java │ │ │ └── wltea │ │ │ └── analyzer │ │ │ └── lucene │ │ │ ├── IKAnalyzer.java │ │ │ └── IKTokenizer.java │ └── resources │ │ └── es-plugin.properties │ └── test │ ├── java │ ├── IkESPluginTest.java │ └── org │ │ └── elasticsearch │ │ └── index │ │ └── analysis │ │ └── ik │ │ └── MockConfiguration.java │ └── resources │ └── META-INF │ └── services │ └── org.elasticsearch.index.analysis.ik.spi.Configuration └── settings.gradle /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | *.iws 4 | *.ipr 5 | *.iml 6 | build/ 7 | .gradle/* 8 | buildSrc/.gradle/ 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | - oraclejdk7 5 | 6 | notifications: 7 | email: true 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | 206 | Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was 207 | derived from unicode conversion examples available at 208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright 209 | from those sources: 210 | 211 | /* 212 | * Copyright 2001-2004 Unicode, Inc. 213 | * 214 | * Disclaimer 215 | * 216 | * This source code is provided as is by Unicode, Inc. No claims are 217 | * made as to fitness for any particular purpose. No warranties of any 218 | * kind are expressed or implied. The recipient agrees to determine 219 | * applicability of information provided. If this file has been 220 | * purchased on magnetic or optical media from Unicode, Inc., the 221 | * sole remedy for any claim will be exchange of defective media 222 | * within 90 days of receipt. 223 | * 224 | * Limitations on Rights to Redistribute This Code 225 | * 226 | * Unicode, Inc. hereby grants the right to freely use the information 227 | * supplied in this file in the creation of products supporting the 228 | * Unicode Standard, and to make copies of this file in any form 229 | * for internal or external distribution as long as this notice 230 | * remains attached. 231 | */ 232 | 233 | 234 | Some code in src/java/org/apache/lucene/util/ArrayUtil.java was 235 | derived from Python 2.4.2 sources available at 236 | http://www.python.org. Full license is here: 237 | 238 | http://www.python.org/download/releases/2.4.2/license/ 239 | 240 | 241 | Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was 242 | derived from ICU (http://www.icu-project.org) 243 | The full license is available here: 244 | http://source.icu-project.org/repos/icu/icu/trunk/license.html 245 | 246 | /* 247 | * Copyright (C) 1999-2010, International Business Machines 248 | * Corporation and others. All Rights Reserved. 
249 | * 250 | * Permission is hereby granted, free of charge, to any person obtaining a copy 251 | * of this software and associated documentation files (the "Software"), to deal 252 | * in the Software without restriction, including without limitation the rights 253 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the 254 | * Software, and to permit persons to whom the Software is furnished to do so, 255 | * provided that the above copyright notice(s) and this permission notice appear 256 | * in all copies of the Software and that both the above copyright notice(s) and 257 | * this permission notice appear in supporting documentation. 258 | * 259 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 260 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 261 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 262 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE 263 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR 264 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 265 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 266 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 267 | * 268 | * Except as contained in this notice, the name of a copyright holder shall not 269 | * be used in advertising or otherwise to promote the sale, use or other 270 | * dealings in this Software without prior written authorization of the 271 | * copyright holder. 272 | */ 273 | 274 | The following license applies to the Snowball stemmers: 275 | 276 | Copyright (c) 2001, Dr Martin Porter 277 | Copyright (c) 2002, Richard Boulton 278 | All rights reserved. 279 | 280 | Redistribution and use in source and binary forms, with or without 281 | modification, are permitted provided that the following conditions are met: 282 | 283 | * Redistributions of source code must retain the above copyright notice, 284 | * this list of conditions and the following disclaimer. 285 | * Redistributions in binary form must reproduce the above copyright 286 | * notice, this list of conditions and the following disclaimer in the 287 | * documentation and/or other materials provided with the distribution. 288 | * Neither the name of the copyright holders nor the names of its contributors 289 | * may be used to endorse or promote products derived from this software 290 | * without specific prior written permission. 291 | 292 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 293 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 294 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 295 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 296 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 297 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 298 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 299 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 300 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 301 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
302 | 303 | The following license applies to the KStemmer: 304 | 305 | Copyright © 2003, 306 | Center for Intelligent Information Retrieval, 307 | University of Massachusetts, Amherst. 308 | All rights reserved. 309 | 310 | Redistribution and use in source and binary forms, with or without modification, 311 | are permitted provided that the following conditions are met: 312 | 313 | 1. Redistributions of source code must retain the above copyright notice, this 314 | list of conditions and the following disclaimer. 315 | 316 | 2. Redistributions in binary form must reproduce the above copyright notice, 317 | this list of conditions and the following disclaimer in the documentation 318 | and/or other materials provided with the distribution. 319 | 320 | 3. The names "Center for Intelligent Information Retrieval" and 321 | "University of Massachusetts" must not be used to endorse or promote products 322 | derived from this software without prior written permission. To obtain 323 | permission, contact info@ciir.cs.umass.edu. 324 | 325 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS 326 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 327 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 328 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE 329 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 330 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 331 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 332 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 333 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 334 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 335 | SUCH DAMAGE. 336 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Lucene 2 | Copyright 2011 The Apache Software Foundation 3 | 4 | This product includes software developed by 5 | The Apache Software Foundation (http://www.apache.org/). 6 | 7 | 8 | The IKAnalyzer 2012 source code (under org/wltea) was 9 | provided by Linliangyi and copyright 2012 by Oolong studio 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kind of Chinese Analysis for Elasticsearch [![Build Status](https://travis-ci.org/zacker330/es-ik.svg?branch=master)](https://travis-ci.org/zacker330/es-ik) 2 | 3 | # Requirements 4 | 5 | - Java 7 update 55 or later 6 | 7 | # Structure of es-ik 8 | 9 | * ik-analysis-core 10 | 11 | The algorithm in this module comes from [ik-analyzer](https://code.google.com/p/ik-analyzer/). In principle, you can use this module to implement a Solr analyzer plugin or an Elasticsearch plugin. 12 | 13 | You just need to implement the `DictionaryConfiguration` interface to provide the dictionary content used by the analysis process (see the sketch after this list). 14 | 15 | * ik-analysis-es-plugin: 16 | 17 | Integrates the ik-analysis-core module with Elasticsearch. Defines a kind of [SPI](https://en.wikipedia.org/wiki/Service_provider_interface): the `Configuration` interface, which extends `DictionaryConfiguration`. 18 | 19 | * es-ik-sqlite3 20 | 21 | Persists the dictionary content in a SQLite3 database. This module is a `service provider` for the `Configuration` SPI defined in ik-analysis-es-plugin.
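
For illustration, here is a minimal sketch of what a custom provider could look like. The class `InMemoryConfiguration` and its hard-coded terms are hypothetical; only the `Configuration` method signatures and the `ServiceLoader` mechanism mirror what the modules in this repository actually define:

    import java.util.ArrayList;
    import java.util.List;

    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.env.Environment;
    import org.elasticsearch.index.Index;
    import org.elasticsearch.index.analysis.ik.spi.Configuration;

    // Hypothetical provider that serves the three dictionaries from memory.
    public class InMemoryConfiguration implements Configuration {

        private final List<char[]> main = new ArrayList<char[]>();
        private final List<char[]> quantifiers = new ArrayList<char[]>();
        private final List<char[]> stopWords = new ArrayList<char[]>();
        private boolean smartMode = true;

        public InMemoryConfiguration() {
            // In a real provider these terms would come from your own store (Redis, files, ...).
            main.add("中文".toCharArray());
            quantifiers.add("个".toCharArray());
            stopWords.add("the".toCharArray());
        }

        @Override
        public boolean isSmartMode() { return smartMode; }

        @Override
        public void setSmartMode(boolean useSmart) { this.smartMode = useSmart; }

        @Override
        public List<char[]> getMainDictionary() { return main; }

        @Override
        public List<char[]> getStopWordDictionary() { return stopWords; }

        @Override
        public List<char[]> getQuantifierDictionary() { return quantifiers; }

        // Called by the plugin once the Elasticsearch environment is available.
        @Override
        public Configuration init(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
            return this;
        }
    }

Register the provider by listing its fully qualified class name in `META-INF/services/org.elasticsearch.index.analysis.ik.spi.Configuration` (exactly as es-ik-sqlite3 does below), and the plugin can then discover it with the standard lookup:

    ServiceLoader<Configuration> loader = ServiceLoader.load(Configuration.class);
    Configuration configuration = loader.iterator().next(); // first registered provider
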
22 | 23 | 24 | # How to use es-ik 25 | 26 | Actually, ik-analysis-es-plugin exposes an interface, `DictionaryConfiguration`, as a kind of SPI. es-ik-sqlite3 implements it so that ik-analysis-es-plugin can read the dictionary content from SQLite. In other words, you can write your own implementation, for example one that persists the dictionary content in Redis. 27 | 28 | SPI is just a concept. In Java, I use [ServiceLoader](https://docs.oracle.com/javase/6/docs/api/java/util/ServiceLoader.html) to implement it. As long as your implementation conforms to ServiceLoader's usage, you get a new provider for ik-analysis-es-plugin without having to change the ik-analysis-es-plugin module itself. :P 29 | 30 | 31 | 32 | 33 | # How to use es-ik-sqlite3 (currently version 1.0.1) 34 | 35 | 36 | - tell Elasticsearch where your SQLite3 db is: add a setting to your elasticsearch.yml, like: 37 | 38 | ik_analysis_db_path: /opt/ik/dictionary.db 39 | 40 | PS: you can download my dictionary.db from https://github.com/zacker330/es-ik-sqlite3-dictionary 41 | 42 | 43 | - go into your Elasticsearch folder, then install the plugin: 44 | 45 | ./bin/plugin -i ik-analysis -u https://github.com/zacker330/es-ik-plugin-sqlite3-release/raw/master/es-ik-sqlite3-1.0.1.zip 46 | 47 | - test your configuration: 48 | 49 | 1. create the songs index 50 | 51 | curl -X PUT -H "Cache-Control: no-cache" -d '{ 52 | "settings":{ 53 | "index":{ 54 | "number_of_shards":1, 55 | "number_of_replicas": 1 56 | } 57 | } 58 | }' 'http://localhost:9200/songs/' 59 | 60 | 2. create the mapping for songs/song 61 | 62 | curl -X PUT -H "Cache-Control: no-cache" -d '{ 63 | "song": { 64 | "_source": {"enabled": true}, 65 | "_all": { 66 | "indexAnalyzer": "ik_analysis", 67 | "searchAnalyzer": "ik_analysis", 68 | "term_vector": "no", 69 | "store": "true" 70 | }, 71 | "properties":{ 72 | "title":{ 73 | "type": "string", 74 | "store": "yes", 75 | "indexAnalyzer": "ik_analysis", 76 | "searchAnalyzer": "ik_analysis", 77 | "include_in_all": "true" 78 | } 79 | } 80 | 81 | } 82 | } 83 | ' 'http://localhost:9200/songs/_mapping/song' 84 | 85 | 3. test it 86 | 87 | curl -X POST -d '林夕为我们作词' 'http://localhost:9200/songs/_analyze?analyzer=ik_analysis' 88 | 89 | response: 90 | {"tokens":[{"token":"林夕","start_offset":0,"end_offset":2,"type":"CN_WORD","position":1},{"token":"作词","start_offset":5,"end_offset":7,"type":"CN_WORD","position":2}]} 91 | 92 | # Create an empty SQLite3 db for es-ik-sqlite3 93 | 94 | 1. create the database 95 | 96 | sqlite3 dictionary.db 97 | 98 | 2.
create tables 99 | 100 | CREATE TABLE main_dictionary(term TEXT NOT NULL,unique(term)); 101 | CREATE TABLE quantifier_dictionary(term TEXT NOT NULL,unique(term)); 102 | CREATE TABLE stopword_dictionary(term TEXT NOT NULL,unique(term)); 103 | 104 | 105 | 617052 records ~= 30MB db file 106 | 107 | 108 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | jcenter() 4 | } 5 | dependencies { 6 | classpath 'com.bmuschko:gradle-nexus-plugin:2.3.1' 7 | } 8 | } 9 | 10 | //apply plugin: 'checkstyle' 11 | 12 | allprojects { 13 | apply plugin: 'idea' 14 | apply plugin: 'com.bmuschko.nexus' 15 | } 16 | 17 | subprojects { 18 | apply plugin: 'java' 19 | apply plugin: 'distribution' 20 | 21 | 22 | sourceCompatibility = 1.7 23 | version = '1.0' 24 | 25 | repositories { 26 | mavenCentral() 27 | } 28 | 29 | distZip { 30 | exclude("**/*-javadoc.jar") 31 | exclude("**/*-tests.jar") 32 | exclude("**/*-sources.jar") 33 | } 34 | 35 | 36 | test { 37 | // enable TestNG support (default is JUnit) 38 | 39 | // show standard out and standard error of the test JVM(s) on the console 40 | testLogging.showStandardStreams = true 41 | 42 | // set heap size for the test JVM(s) 43 | minHeapSize = "128m" 44 | maxHeapSize = "1024m" 45 | 46 | // set JVM arguments for the test JVM(s) 47 | jvmArgs '-XX:MaxPermSize=256m' 48 | 49 | // listen to events in the test execution lifecycle 50 | beforeTest { descriptor -> 51 | logger.lifecycle("Running test: " + descriptor) 52 | } 53 | 54 | // listen to standard out and standard error of the test JVM(s) 55 | // onOutput { descriptor, event -> 56 | // logger.lifecycle("Test: " + descriptor + " produced standard out/err: " + event.message ) 57 | // } 58 | } 59 | } 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /es-ik-sqlite3/build.gradle: -------------------------------------------------------------------------------- 1 | group = "io.github.zacker330.es" 2 | archivesBaseName = "es-ik-sqlite3" 3 | version = "1.0.1" 4 | 5 | ext { 6 | LUCENCE_VERSION = '4.10.4' 7 | ELASTICSEARCH_VERSION = '1.6.0' 8 | } 9 | 10 | dependencies { 11 | 12 | compile project(':ik-analysis-core') 13 | compile project(':ik-analysis-es-plugin') 14 | compile( 15 | "org.elasticsearch:elasticsearch:$ELASTICSEARCH_VERSION", 16 | "org.apache.lucene:lucene-core:$LUCENCE_VERSION", 17 | "org.apache.lucene:lucene-queryparser:$LUCENCE_VERSION", 18 | "org.apache.lucene:lucene-analyzers-common:$LUCENCE_VERSION", 19 | files('libs/sqlite-jdbc-3.8.10.1.jar') 20 | ) 21 | runtime('ch.qos.logback:logback-classic:1.1.3') 22 | 23 | testCompile('com.google.guava:guava:18.0') 24 | testCompile('commons-dbutils:commons-dbutils:1.6') 25 | testCompile('junit:junit:4.12', 26 | "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.14", 27 | "org.apache.lucene:lucene-test-framework:$LUCENCE_VERSION" 28 | ) 29 | testCompile project(':ik-analysis-es-plugin') 30 | 31 | } 32 | 33 | modifyPom { 34 | project { 35 | name 'es-ik' 36 | description 'Kind of Chinese Analysis for Elasticsearch' 37 | url 'https://github.com/zacker330/es-ik' 38 | inceptionYear '2015' 39 | 40 | scm { 41 | url 'https://github.com/zacker330/es-ik' 42 | connection 'scm:https://github.com/zacker330/es-ik.git' 43 | developerConnection 'scm:git@github.com:zacker330/es-ik.git' 44 | } 45 | 46 | licenses { 47 | license { 48 | name 'The Apache Software 
License, Version 2.0' 49 | url 'http://www.apache.org/licenses/LICENSE-2.0.txt' 50 | distribution 'repo' 51 | } 52 | } 53 | 54 | developers { 55 | developer { 56 | id 'zacker330' 57 | name 'Jack' 58 | email 'zacker330@gmail.com' 59 | } 60 | } 61 | } 62 | } 63 | 64 | extraArchive { 65 | sources = true 66 | tests = true 67 | javadoc = true 68 | } 69 | 70 | distributions { 71 | main { 72 | baseName = 'es-ik-sqlite3' 73 | contents { 74 | from { "build/libs/" } 75 | from { "libs/" } 76 | from { project(":ik-analysis-core").buildDir.path + '/libs/' } 77 | from { project(":ik-analysis-es-plugin").buildDir.path + '/libs/' } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /es-ik-sqlite3/libs/sqlite-jdbc-3.8.10.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacker330/es-ik/4fc353df3a1b1d891a5501a48c2e23a96e042383/es-ik-sqlite3/libs/sqlite-jdbc-3.8.10.1.jar -------------------------------------------------------------------------------- /es-ik-sqlite3/src/main/java/io/github/zacker330/es/ik/es/ik/analyzer/Sqlite3Configuration.java: -------------------------------------------------------------------------------- 1 | package io.github.zacker330.es.ik.es.ik.analyzer; 2 | 3 | import org.elasticsearch.common.logging.ESLogger; 4 | import org.elasticsearch.common.logging.ESLoggerFactory; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.Index; 8 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 9 | import org.elasticsearch.index.settings.IndexSettings; 10 | 11 | import java.sql.*; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class Sqlite3Configuration implements Configuration { 16 | 17 | private final ESLogger logger = ESLoggerFactory.getLogger(Sqlite3Configuration.class.getName()); 18 | 19 | private List<char[]> mainDictionary; 20 | private List<char[]> quantifierDictionary; 21 | private List<char[]> stopWordDictionary; 22 | 23 | 24 | private boolean smartMode = true; 25 | 26 | public Sqlite3Configuration() { 27 | } 28 | 29 | private Sqlite3Configuration(String dbPath) { 30 | if (dbPath == null || "".equals(dbPath.trim())) { 31 | logger.error("dbPath is required!"); 32 | throw new IllegalArgumentException(); 33 | } 34 | 35 | 36 | mainDictionary = new ArrayList<char[]>(); 37 | quantifierDictionary = new ArrayList<char[]>(); 38 | stopWordDictionary = new ArrayList<char[]>(); 39 | Connection connection = null; 40 | Statement statement = null; 41 | 42 | try { 43 | Class.forName("org.sqlite.JDBC"); 44 | connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath); 45 | statement = connection.createStatement(); 46 | statement.setQueryTimeout(30); 47 | ResultSet mainResult = statement.executeQuery("select * from main_dictionary"); 48 | while (mainResult.next()) { 49 | String term = mainResult.getString("term"); 50 | if (term == null || "".equals(term.trim())) { 51 | continue; 52 | } 53 | mainDictionary.add(term.toCharArray()); 54 | } 55 | 56 | ResultSet stopWordResult = statement.executeQuery("select * from stopword_dictionary"); 57 | while (stopWordResult.next()) { 58 | String term = stopWordResult.getString("term"); 59 | if (term == null || "".equals(term.trim())) { 60 | continue; 61 | } 62 | stopWordDictionary.add(term.toCharArray()); 63 | } 64 | 65 | ResultSet quantifierResult = statement.executeQuery("select * from quantifier_dictionary"); 66 | while (quantifierResult.next()) { 67 |
String term = quantifierResult.getString("term"); 68 | if (term == null || "".equals(term.trim())) { 69 | continue; 70 | } 71 | quantifierDictionary.add(term.toCharArray()); 72 | } 73 | 74 | } catch (SQLException e) { 75 | logger.error("there's sql error", e); 76 | throw new RuntimeException(e); 77 | } catch (ClassNotFoundException e) { 78 | logger.error("not found sqlite3 jdbc", e); 79 | throw new RuntimeException(e); 80 | } finally { 81 | try { 82 | if (statement != null) { 83 | statement.close(); 84 | statement = null; 85 | } 86 | if (connection != null) { 87 | connection.close(); 88 | connection = null; 89 | } 90 | } catch (SQLException e) { 91 | logger.error("can't close jdbc connection", e); 92 | throw new RuntimeException(e); 93 | } 94 | } 95 | } 96 | 97 | public static Sqlite3Configuration smartModeSqlite3Configure(String dbPath) { 98 | Sqlite3Configuration sqlite3Configure = new Sqlite3Configuration(dbPath); 99 | sqlite3Configure.setSmartMode(true); 100 | return sqlite3Configure; 101 | } 102 | 103 | 104 | /** 105 | * Returns the useSmart flag. 106 | * When isSmartMode is true, the tokenizer uses the smart segmentation strategy; when false, it uses fine-grained segmentation. 107 | * 108 | * @return isSmartMode 109 | */ 110 | public boolean isSmartMode() { 111 | return smartMode; 112 | } 113 | 114 | /** 115 | * Sets the useSmart flag. 116 | * When isSmartMode is true, the tokenizer uses the smart segmentation strategy; when false, it uses fine-grained segmentation. 117 | * 118 | * @param smartMode 119 | */ 120 | public void setSmartMode(boolean smartMode) { 121 | this.smartMode = smartMode; 122 | } 123 | 124 | @Override 125 | public List<char[]> getMainDictionary() { 126 | return mainDictionary; 127 | } 128 | 129 | @Override 130 | public List<char[]> getStopWordDictionary() { 131 | return stopWordDictionary; 132 | } 133 | 134 | @Override 135 | public List<char[]> getQuantifierDictionary() { 136 | return quantifierDictionary; 137 | } 138 | 139 | 140 | @Override 141 | public Configuration init(Index index, @IndexSettings Settings indexSettings, Environment env, String name, Settings settings) { 142 | return Sqlite3Configuration.smartModeSqlite3Configure(env.settings().get("ik_analysis_db_path")); 143 | } 144 | } 145 | 146 | 147 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/main/resources/META-INF/services/org.elasticsearch.index.analysis.ik.spi.Configuration: -------------------------------------------------------------------------------- 1 | io.github.zacker330.es.ik.es.ik.analyzer.Sqlite3Configuration 2 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/io/github/zacker330/es/ik/AbstractIntegrationTest.java: -------------------------------------------------------------------------------- 1 | package io.github.zacker330.es.ik; 2 | 3 | import com.google.common.base.Function; 4 | import org.apache.commons.dbutils.QueryRunner; 5 | import org.junit.AfterClass; 6 | import org.junit.Assert; 7 | import org.junit.BeforeClass; 8 | import org.wltea.analyzer.IKAnalzyerTest; 9 | 10 | import java.io.*; 11 | import java.sql.SQLException; 12 | 13 | public abstract class AbstractIntegrationTest { 14 | 15 | public final static String dbPath = AbstractIntegrationTest.class.getResource(".") + "dictionary.db"; 16 | 17 | @BeforeClass 18 | public static void prepareDatabase() throws IOException { 19 | 20 | 21 | if (new File(dbPath).exists()) { 22 | Assert.assertTrue(new File(dbPath).delete()); 23 | } 24 | 25 | Assert.assertTrue(runSQL(dbPath, "CREATE TABLE IF NOT EXISTS main_dictionary(term TEXT NOT NULL,unique(term));")); 26 | Assert.assertTrue(runSQL(dbPath, "CREATE TABLE IF NOT EXISTS
stopword_dictionary(term TEXT NOT NULL,unique(term));")); 27 | Assert.assertTrue(runSQL(dbPath, "CREATE TABLE IF NOT EXISTS quantifier_dictionary(term TEXT NOT NULL,unique(term));")); 28 | // 29 | insertTerm("INSERT OR IGNORE INTO quantifier_dictionary values(?);", new IKAnalzyerTest().getClass().getClassLoader().getResourceAsStream("./quantifierDic.properties")); 30 | insertTerm("INSERT OR IGNORE INTO stopword_dictionary values(?);", new IKAnalzyerTest().getClass().getClassLoader().getResourceAsStream("./stopwordDic.properties")); 31 | insertTerm("INSERT OR IGNORE INTO main_dictionary values(?);", new IKAnalzyerTest().getClass().getClassLoader().getResourceAsStream("./mainDic.properties")); 32 | } 33 | 34 | @AfterClass 35 | public static void cleanDatabase() { 36 | if (new File(dbPath).exists()) { 37 | Assert.assertTrue(new File(dbPath).delete()); 38 | } 39 | } 40 | 41 | private static void insertTerm(String sql, InputStream dataLineByLineInputStream) throws IOException { 42 | readAndProcessTextInLine(dataLineByLineInputStream, new AbstractIntegrationTest.SQLRunFunction(dbPath, sql)); 43 | } 44 | 45 | private static void readAndProcessTextInLine(InputStream inputStream, Function<String, Boolean> function) throws IOException { 46 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"), 512); 47 | 48 | String line = null; 49 | do { 50 | line = bufferedReader.readLine(); 51 | if (line != null && !"".equals(line.trim())) { 52 | if (!function.apply(line.trim().toLowerCase())) { 53 | break; 54 | } 55 | } 56 | } while (line != null); 57 | } 58 | 59 | 60 | private static class SQLRunFunction implements Function<String, Boolean> { 61 | 62 | private String dbPath; 63 | private String sql; 64 | 65 | public SQLRunFunction(String dbPath, String sql) { 66 | this.dbPath = dbPath; 67 | this.sql = sql; 68 | } 69 | 70 | @Override 71 | public Boolean apply(String arg) { 72 | return runSQL(dbPath, sql, arg); 73 | } 74 | } 75 | 76 | private static boolean runSQL(String dbPath, String sql, Object...
args) { 77 | QueryRunner queryRunner = new QueryRunner(new DictionaryDataSource(dbPath)); 78 | try { 79 | System.out.println("SQL: " + sql); 80 | int result = queryRunner.update(sql, args); 81 | } catch (SQLException e) { 82 | System.out.println(e); 83 | return false; 84 | } 85 | return true; 86 | } 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/io/github/zacker330/es/ik/DictionaryDatasource.java: -------------------------------------------------------------------------------- 1 | package io.github.zacker330.es.ik; 2 | 3 | import javax.sql.DataSource; 4 | import java.io.PrintWriter; 5 | import java.sql.Connection; 6 | import java.sql.DriverManager; 7 | import java.sql.SQLException; 8 | import java.sql.SQLFeatureNotSupportedException; 9 | import java.util.logging.Logger; 10 | 11 | public class DictionaryDataSource implements DataSource { 12 | 13 | private String dbPath; 14 | 15 | public DictionaryDataSource(String dbPath) { 16 | this.dbPath = dbPath; 17 | } 18 | 19 | @Override 20 | public Connection getConnection() throws SQLException { 21 | try { 22 | Class.forName("org.sqlite.JDBC"); 23 | } catch (ClassNotFoundException e) { 24 | System.out.println(e); 25 | } 26 | return DriverManager.getConnection("jdbc:sqlite:" + dbPath); 27 | } 28 | 29 | @Deprecated 30 | @Override 31 | public Connection getConnection(String username, String password) throws SQLException { 32 | return null; 33 | } 34 | 35 | @Deprecated 36 | @Override 37 | public PrintWriter getLogWriter() throws SQLException { 38 | return null; 39 | } 40 | 41 | @Deprecated 42 | 43 | @Override 44 | public void setLogWriter(PrintWriter out) throws SQLException { 45 | 46 | } 47 | 48 | @Deprecated 49 | 50 | @Override 51 | public void setLoginTimeout(int seconds) throws SQLException { 52 | 53 | } 54 | 55 | @Override 56 | public int getLoginTimeout() throws SQLException { 57 | return 100; 58 | } 59 | 60 | @Deprecated 61 | 62 | public Logger getParentLogger() throws SQLFeatureNotSupportedException { 63 | return null; 64 | } 65 | 66 | @Deprecated 67 | 68 | @Override 69 | public <T> T unwrap(Class<T> iface) throws SQLException { 70 | return null; 71 | } 72 | 73 | 74 | @Override 75 | public boolean isWrapperFor(Class<?> iface) throws SQLException { 76 | return false; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/org/wltea/analyzer/IKAnalzyerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package org.wltea.analyzer; 3 | 4 | import io.github.zacker330.es.ik.AbstractIntegrationTest; 5 | import io.github.zacker330.es.ik.es.ik.analyzer.Sqlite3Configuration; 6 | import org.apache.lucene.analysis.Analyzer; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 11 | import org.junit.Assert; 12 | import org.junit.Test; 13 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 14 | import org.wltea.analyzer.lucene.IKAnalyzer; 15 | 16 | import java.io.IOException; 17 | import java.io.StringReader; 18 | 19 | /** 20 | * Demonstrates tokenization with IKAnalyzer 21 | * 2012-10-22 22 | */ 23 | public class IKAnalzyerTest extends AbstractIntegrationTest { 24 | 25 | private DictionaryConfiguration configuration; 26 | 27 | @Test 28 | public void testAnalyzer() {
29 | // Build the IK tokenizer, using the smart segmentation mode 30 | 31 | configuration = Sqlite3Configuration.smartModeSqlite3Configure(dbPath); 32 | Analyzer analyzer = new IKAnalyzer(configuration); 33 | 34 | // Obtain Lucene's TokenStream object 35 | TokenStream tokenStream = null; 36 | try { 37 | tokenStream = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA <html>HELLO")); 38 | // ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too")); 39 | // Get the lexeme offset attribute 40 | OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class); 41 | // Get the lexeme text attribute 42 | CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class); 43 | // Get the lexeme type attribute 44 | TypeAttribute type = tokenStream.addAttribute(TypeAttribute.class); 45 | 46 | 47 | // Reset the TokenStream (resets the StringReader) 48 | tokenStream.reset(); 49 | 50 | tokenStream.incrementToken(); 51 | Assert.assertEquals(0, offset.startOffset()); 52 | Assert.assertEquals(5, offset.endOffset()); 53 | Assert.assertEquals("ENGLISH", type.type()); 54 | Assert.assertEquals("world", term.toString()); 55 | 56 | 57 | tokenStream.incrementToken(); 58 | Assert.assertEquals(10, offset.startOffset()); 59 | Assert.assertEquals(14, offset.endOffset()); 60 | Assert.assertEquals("ENGLISH", type.type()); 61 | Assert.assertEquals("html", term.toString()); 62 | 63 | 64 | tokenStream.incrementToken(); 65 | Assert.assertEquals(15, offset.startOffset()); 66 | Assert.assertEquals(19, offset.endOffset()); 67 | Assert.assertEquals("ENGLISH", type.type()); 68 | Assert.assertEquals("data", term.toString()); 69 | 70 | tokenStream.incrementToken(); 71 | Assert.assertEquals(21, offset.startOffset()); 72 | Assert.assertEquals(25, offset.endOffset()); 73 | Assert.assertEquals("ENGLISH", type.type()); 74 | Assert.assertEquals("html", term.toString()); 75 | 76 | tokenStream.incrementToken(); 77 | Assert.assertEquals(26, offset.startOffset()); 78 | Assert.assertEquals(31, offset.endOffset()); 79 | Assert.assertEquals("ENGLISH", type.type()); 80 | Assert.assertEquals("hello", term.toString()); 81 | 82 | 83 | // End the TokenStream (closes the StringReader) 84 | tokenStream.end(); 85 | 86 | } catch (IOException e) { 87 | e.printStackTrace(); 88 | } finally { 89 | // Release all of the TokenStream's resources 90 | if (tokenStream != null) { 91 | try { 92 | tokenStream.close(); 93 | } catch (IOException e) { 94 | e.printStackTrace(); 95 | } 96 | } 97 | } 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/org/wltea/analyzer/LuceneIndexAndSearchTest.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | import io.github.zacker330.es.ik.AbstractIntegrationTest; 4 | import io.github.zacker330.es.ik.es.ik.analyzer.Sqlite3Configuration; 5 | import org.apache.lucene.analysis.Analyzer; 6 | import org.apache.lucene.document.Document; 7 | import org.apache.lucene.document.Field; 8 | import org.apache.lucene.document.StringField; 9 | import org.apache.lucene.document.TextField; 10 | import org.apache.lucene.index.*; 11 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 12 | import org.apache.lucene.queryparser.classic.ParseException; 13 | import org.apache.lucene.queryparser.classic.QueryParser; 14 | import org.apache.lucene.search.IndexSearcher; 15 | import org.apache.lucene.search.Query; 16 | import org.apache.lucene.search.TopDocs; 17 | import org.apache.lucene.store.Directory; 18 | import org.apache.lucene.store.LockObtainFailedException; 19 | import org.apache.lucene.store.RAMDirectory;
20 | import org.apache.lucene.util.Version; 21 | import org.junit.Assert; 22 | import org.junit.Ignore; 23 | import org.junit.Test; 24 | import org.wltea.analyzer.lucene.IKAnalyzer; 25 | 26 | import java.io.IOException; 27 | 28 | public class LuceneIndexAndSearchTest extends AbstractIntegrationTest { 29 | 30 | 31 | @Test 32 | public void testLucenceIndex() { 33 | // Field name of the Lucene Document 34 | String fieldName = "text"; 35 | // Content to index and search 36 | String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; 37 | 38 | // Instantiate the IKAnalyzer tokenizer 39 | Analyzer analyzer = new IKAnalyzer(Sqlite3Configuration.smartModeSqlite3Configure(dbPath)); 40 | 41 | Directory directory = null; 42 | IndexWriter iwriter = null; 43 | IndexReader ireader = null; 44 | IndexSearcher isearcher = null; 45 | try { 46 | // Build an in-memory index 47 | directory = new RAMDirectory(); 48 | 49 | // Configure the IndexWriterConfig 50 | IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer); 51 | iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); 52 | iwriter = new IndexWriter(directory, iwConfig); 53 | // Write to the index 54 | Document doc = new Document(); 55 | doc.add(new StringField("ID", "10000", Field.Store.YES)); 56 | doc.add(new TextField(fieldName, text, Field.Store.YES)); 57 | iwriter.addDocument(doc); 58 | iwriter.close(); 59 | 60 | 61 | // Search phase ********************************** 62 | // Instantiate the searcher 63 | ireader = DirectoryReader.open(directory); 64 | isearcher = new IndexSearcher(ireader); 65 | 66 | String keyword = "中文分词工具包"; 67 | // Build the Query object with the QueryParser 68 | QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); 69 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 70 | Query query = qp.parse(keyword); 71 | 72 | Assert.assertEquals(query.toString(), "+text:中文 +text:分词 +text:工具包"); 73 | 74 | // Retrieve the 5 highest-scoring records 75 | TopDocs topDocs = isearcher.search(query, 5); 76 | 77 | 78 | Assert.assertEquals(topDocs.totalHits, 1); 79 | Assert.assertEquals(isearcher.doc(topDocs.scoreDocs[0].doc).toString(), "Document<stored,indexed,omitNorms,indexOptions=DOCS_ONLY<ID:10000> stored,indexed,tokenized<text:IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。>>"); 80 | 81 | } catch (CorruptIndexException e) { 82 | e.printStackTrace(); 83 | } catch (LockObtainFailedException e) { 84 | e.printStackTrace(); 85 | } catch (IOException e) { 86 | e.printStackTrace(); 87 | } catch (ParseException e) { 88 | e.printStackTrace(); 89 | } finally { 90 | if (ireader != null) { 91 | try { 92 | ireader.close(); 93 | } catch (IOException e) { 94 | e.printStackTrace(); 95 | } 96 | } 97 | if (directory != null) { 98 | try { 99 | directory.close(); 100 | } catch (IOException e) { 101 | e.printStackTrace(); 102 | } 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/database.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE main_dictionary(term TEXT NOT NULL,unique(term)); 2 | 3 | CREATE TABLE stopword_dictionary(term TEXT NOT NULL,unique(term)); 4 | 5 | CREATE TABLE quantifier_dictionary(term TEXT NOT NULL,unique(term)); 6 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | 4 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> 5 | <encoder> 6 | <pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern> 7 | </encoder> 8 | </appender> 9 | 10 | <root level="debug"> 11 | <appender-ref ref="STDOUT" /> 12 | </root> 13 | 14 | </configuration> 15 | --------------------------------------------------------------------------------
/es-ik-sqlite3/src/test/resources/mainDic.properties: -------------------------------------------------------------------------------- 1 | 这是 2 | 中文 3 | 分词 4 | 例子 5 | 结合 6 | 词典 7 | 文法 8 | 开源 9 | 工具包 10 | 使用 11 | 全新 12 | 迭代 13 | 最细 14 | 正向 15 | 粒度 16 | 切分 17 | 算法 18 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/quantifierDic.properties: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 划 43 | 列 44 | 则 45 | 刻 46 | 剂 47 | 剑 48 | 副 49 | 加仑 50 | 勺 51 | 包 52 | 匙 53 | 匹 54 | 区 55 | 千克 56 | 千米 57 | 升 58 | 卷 59 | 厅 60 | 厘 61 | 双 62 | 发 63 | 口 64 | 句 65 | 只 66 | 台 67 | 叶 68 | 号 69 | 名 70 | 吨 71 | 听 72 | 员 73 | 周 74 | 周年 75 | 品 76 | 回 77 | 团 78 | 圆 79 | 圈 80 | 地 81 | 场 82 | 块 83 | 坪 84 | 堆 85 | 声 86 | 壶 87 | 处 88 | 夜 89 | 大 90 | 天 91 | 头 92 | 套 93 | 女 94 | 孔 95 | 字 96 | 宗 97 | 室 98 | 家 99 | 寸 100 | 对 101 | 封 102 | 尊 103 | 小时 104 | 尺 105 | 尾 106 | 局 107 | 层 108 | 届 109 | 岁 110 | 师 111 | 帧 112 | 幅 113 | 幕 114 | 幢 115 | 平方 116 | 平方公尺 117 | 平方公里 118 | 平方分米 119 | 平方厘米 120 | 平方码 121 | 平方米 122 | 平方英寸 123 | 平方英尺 124 | 平方英里 125 | 平米 126 | 年 127 | 年代 128 | 年级 129 | 度 130 | 座 131 | 式 132 | 引 133 | 张 134 | 成 135 | 战 136 | 截 137 | 户 138 | 房 139 | 所 140 | 扇 141 | 手 142 | 打 143 | 批 144 | 把 145 | 折 146 | 担 147 | 拉 148 | 拍 149 | 招 150 | 拨 151 | 拳 152 | 指 153 | 掌 154 | 排 155 | 撮 156 | 支 157 | 文 158 | 斗 159 | 斤 160 | 方 161 | 族 162 | 日 163 | 时 164 | 曲 165 | 月 166 | 月份 167 | 期 168 | 本 169 | 朵 170 | 村 171 | 束 172 | 条 173 | 来 174 | 杯 175 | 枚 176 | 枝 177 | 枪 178 | 架 179 | 柄 180 | 柜 181 | 栋 182 | 栏 183 | 株 184 | 样 185 | 根 186 | 格 187 | 案 188 | 桌 189 | 档 190 | 桩 191 | 桶 192 | 梯 193 | 棵 194 | 楼 195 | 次 196 | 款 197 | 步 198 | 段 199 | 毛 200 | 毫 201 | 池 202 | 洲 203 | 派 204 | 海里 205 | 滴 206 | 炮 207 | 点 208 | 点钟 209 | 片 210 | 版 211 | 环 212 | 班 213 | 瓣 214 | 瓶 215 | 生 216 | 男 217 | 画 218 | 界 219 | 盆 220 | 盎司 221 | 盏 222 | 盒 223 | 盘 224 | 相 225 | 眼 226 | 石 227 | 码 228 | 碗 229 | 碟 230 | 磅 231 | 种 232 | 科 233 | 秒 234 | 秒钟 235 | 窝 236 | 立方公尺 237 | 立方分米 238 | 立方厘米 239 | 立方码 240 | 立方米 241 | 立方英寸 242 | 立方英尺 243 | 站 244 | 章 245 | 笔 246 | 等 247 | 筐 248 | 筒 249 | 箱 250 | 篇 251 | 篓 252 | 篮 253 | 簇 254 | 米 255 | 类 256 | 粒 257 | 级 258 | 组 259 | 维 260 | 缕 261 | 缸 262 | 罐 263 | 网 264 | 群 265 | 股 266 | 脚 267 | 船 268 | 艇 269 | 艘 270 | 色 271 | 节 272 | 英亩 273 | 英寸 274 | 英尺 275 | 英里 276 | 行 277 | 袋 278 | 角 279 | 言 280 | 课 281 | 起 282 | 趟 283 | 路 284 | 车 285 | 转 286 | 轮 287 | 辆 288 | 辈 289 | 连 290 | 通 291 | 遍 292 | 部 293 | 里 294 | 重 295 | 针 296 | 钟 297 | 钱 298 | 锅 299 | 门 300 | 间 301 | 队 302 | 阶段 303 | 隅 304 | 集 305 | 页 306 | 顶 307 | 顷 308 | 项 309 | 顿 310 | 颗 311 | 餐 312 | 首 313 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/stopwordDic.properties: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with 34 | -------------------------------------------------------------------------------- 
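
The three term lists above feed the integration tests, and the same kind of data populates the dictionary.db described in the README. As a reference, here is a minimal standalone loader sketch that reuses the repository's own `INSERT OR IGNORE` statement and table names; the `DictionaryLoader` class and the file/db paths are hypothetical placeholders:

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;

    public class DictionaryLoader {
        public static void main(String[] args) throws Exception {
            Class.forName("org.sqlite.JDBC");
            Connection connection = DriverManager.getConnection("jdbc:sqlite:dictionary.db");
            PreparedStatement insert = connection.prepareStatement("INSERT OR IGNORE INTO main_dictionary values(?);");
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("mainDic.properties"), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                // one term per line, normalised the same way the tests normalise terms
                if (!"".equals(line.trim())) {
                    insert.setString(1, line.trim().toLowerCase());
                    insert.executeUpdate();
                }
            }
            reader.close();
            insert.close();
            connection.close();
        }
    }

The quantifier and stopword lists load the same way against quantifier_dictionary and stopword_dictionary.
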
/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacker330/es-ik/4fc353df3a1b1d891a5501a48c2e23a96e042383/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Thu Jun 18 17:26:55 GMT+08:00 2015 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # For Cygwin, ensure paths are in UNIX format before anything is touched. 46 | if $cygwin ; then 47 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 48 | fi 49 | 50 | # Attempt to set APP_HOME 51 | # Resolve links: $0 may be a link 52 | PRG="$0" 53 | # Need this for relative symlinks. 54 | while [ -h "$PRG" ] ; do 55 | ls=`ls -ld "$PRG"` 56 | link=`expr "$ls" : '.*-> \(.*\)$'` 57 | if expr "$link" : '/.*' > /dev/null; then 58 | PRG="$link" 59 | else 60 | PRG=`dirname "$PRG"`"/$link" 61 | fi 62 | done 63 | SAVED="`pwd`" 64 | cd "`dirname \"$PRG\"`/" >&- 65 | APP_HOME="`pwd -P`" 66 | cd "$SAVED" >&- 67 | 68 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 69 | 70 | # Determine the Java command to use to start the JVM. 71 | if [ -n "$JAVA_HOME" ] ; then 72 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 73 | # IBM's JDK on AIX uses strange locations for the executables 74 | JAVACMD="$JAVA_HOME/jre/sh/java" 75 | else 76 | JAVACMD="$JAVA_HOME/bin/java" 77 | fi 78 | if [ ! -x "$JAVACMD" ] ; then 79 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 80 | 81 | Please set the JAVA_HOME variable in your environment to match the 82 | location of your Java installation." 83 | fi 84 | else 85 | JAVACMD="java" 86 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 87 | 88 | Please set the JAVA_HOME variable in your environment to match the 89 | location of your Java installation." 90 | fi 91 | 92 | # Increase the maximum file descriptors if we can. 93 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 94 | MAX_FD_LIMIT=`ulimit -H -n` 95 | if [ $? 
-eq 0 ] ; then 96 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 97 | MAX_FD="$MAX_FD_LIMIT" 98 | fi 99 | ulimit -n $MAX_FD 100 | if [ $? -ne 0 ] ; then 101 | warn "Could not set maximum file descriptor limit: $MAX_FD" 102 | fi 103 | else 104 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 105 | fi 106 | fi 107 | 108 | # For Darwin, add options to specify how the application appears in the dock 109 | if $darwin; then 110 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 111 | fi 112 | 113 | # For Cygwin, switch paths to Windows format before running java 114 | if $cygwin ; then 115 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 116 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 158 | function splitJvmOpts() { 159 | JVM_OPTS=("$@") 160 | } 161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 163 | 164 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 165 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 
16 | set APP_BASE_NAME=%~n0
17 | set APP_HOME=%DIRNAME%
18 | 
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 | 
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 | 
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 | 
32 | goto fail
33 | 
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 | 
38 | if exist "%JAVA_EXE%" goto init
39 | 
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 | 
46 | goto fail
47 | 
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 | 
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | if "%@eval[2+2]" == "4" goto 4NT_args
53 | 
54 | :win9xME_args
55 | @rem Slurp the command line arguments.
56 | set CMD_LINE_ARGS=
57 | set _SKIP=2
58 | 
59 | :win9xME_args_slurp
60 | if "x%~1" == "x" goto execute
61 | 
62 | set CMD_LINE_ARGS=%*
63 | goto execute
64 | 
65 | :4NT_args
66 | @rem Get arguments from the 4NT Shell from JP Software
67 | set CMD_LINE_ARGS=%$
68 | 
69 | :execute
70 | @rem Set up the command line
71 | 
72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73 | 
74 | @rem Execute Gradle
75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76 | 
77 | :end
78 | @rem End local scope for the variables with windows NT shell
79 | if "%ERRORLEVEL%"=="0" goto mainEnd
80 | 
81 | :fail
82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83 | rem the _cmd.exe /c_ return code!
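rem (Any non-empty value enables this, e.g. "set GRADLE_EXIT_CONSOLE=yes".)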
84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /ik-analysis-core/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | 3 | group = "io.github.zacker330.es" 4 | archivesBaseName = "ik-analysis-core" 5 | version = "1.0.0" 6 | 7 | repositories { 8 | mavenCentral() 9 | } 10 | 11 | dependencies { 12 | runtime('ch.qos.logback:logback-classic:1.1.3') 13 | testCompile('junit:junit:4.12') 14 | } 15 | 16 | 17 | modifyPom { 18 | project { 19 | name 'es-ik' 20 | description 'Kind of Chinese Analysis for Elasticsearch' 21 | url 'https://github.com/zacker330/es-ik' 22 | inceptionYear '2015' 23 | 24 | scm { 25 | url 'https://github.com/zacker330/es-ik' 26 | connection 'scm:https://github.com/zacker330/es-ik.git' 27 | developerConnection 'scm:git@github.com:zacker330/es-ik.git' 28 | } 29 | 30 | licenses { 31 | license { 32 | name 'The Apache Software License, Version 2.0' 33 | url 'http://www.apache.org/licenses/LICENSE-2.0.txt' 34 | distribution 'repo' 35 | } 36 | } 37 | 38 | developers { 39 | developer { 40 | id 'zacker330' 41 | name 'Jack' 42 | email 'zacker330@gmail.com' 43 | } 44 | } 45 | } 46 | 47 | } 48 | 49 | javadoc { 50 | source = sourceSets.main.allJava 51 | classpath = configurations.compile 52 | } 53 | 54 | extraArchive { 55 | sources = true 56 | tests = true 57 | javadoc = true 58 | } 59 | 60 | -------------------------------------------------------------------------------- /ik-analysis-core/config/checkstyle/checkstyle.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/configuration/DictionaryConfiguration.java: -------------------------------------------------------------------------------- 1 | 2 | package org.wltea.analyzer.configuration; 3 | 4 | import java.util.List; 5 | 6 | public interface DictionaryConfiguration { 7 | 8 | 9 | 10 | public boolean isSmartMode(); 11 | 12 | public void setSmartMode(boolean useSmart); 13 | 14 | List getMainDictionary(); 15 | 16 | List getStopWordDictionary(); 17 | 18 | List getQuantifierDictionary(); 19 | } 20 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.LinkedList; 8 | import java.util.Map; 9 | import java.util.Set; 10 | 11 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 12 | import org.wltea.analyzer.dic.Dictionary; 13 | 14 | class AnalyzeContext { 15 | 16 | //默认缓冲区大小 17 | private static final int BUFF_SIZE = 3072; 18 | //缓冲区耗尽的临界值 19 | private static final int BUFF_EXHAUST_CRITICAL = 48; 20 | 21 | 22 | //字符窜读取缓冲 23 | private char[] segmentBuff; 24 | //字符类型数组 25 | private int[] charTypes; 26 | 27 | 28 | //记录Reader内已分析的字串总长度 29 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 30 | private int buffOffset; 31 | //当前缓冲区位置指针 32 | private int cursor; 33 | //最近一次读入的,可处理的字串长度 34 | private int available; 35 | 36 | 37 | 
//子分词器锁 38 | //该集合非空,说明有子分词器在占用segmentBuff 39 | private Set buffLocker; 40 | 41 | //原始分词结果集合,未经歧义处理 42 | private QuickSortSet orgLexemes; 43 | //LexemePath位置索引表 44 | private Map pathMap; 45 | //最终分词结果集 46 | private LinkedList results; 47 | 48 | //分词器配置项 49 | private DictionaryConfiguration cfg; 50 | 51 | public AnalyzeContext(DictionaryConfiguration cfg){ 52 | this.cfg = cfg; 53 | this.segmentBuff = new char[BUFF_SIZE]; 54 | this.charTypes = new int[BUFF_SIZE]; 55 | this.buffLocker = new HashSet(); 56 | this.orgLexemes = new QuickSortSet(); 57 | this.pathMap = new HashMap(); 58 | this.results = new LinkedList(); 59 | } 60 | 61 | int getCursor(){ 62 | return this.cursor; 63 | } 64 | 65 | char[] getSegmentBuff(){ 66 | return this.segmentBuff; 67 | } 68 | 69 | char getCurrentChar(){ 70 | return this.segmentBuff[this.cursor]; 71 | } 72 | 73 | int getCurrentCharType(){ 74 | return this.charTypes[this.cursor]; 75 | } 76 | 77 | int getBufferOffset(){ 78 | return this.buffOffset; 79 | } 80 | 81 | int fillBuffer(Reader reader) throws IOException{ 82 | int readCount = 0; 83 | if(this.buffOffset == 0){ 84 | //首次读取reader 85 | readCount = reader.read(segmentBuff); 86 | }else{ 87 | int offset = this.available - this.cursor; 88 | if(offset > 0){ 89 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 90 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset); 91 | readCount = offset; 92 | } 93 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 94 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset); 95 | } 96 | //记录最后一次从Reader中读入的可用字符长度 97 | this.available = readCount; 98 | //重置当前指针 99 | this.cursor = 0; 100 | return readCount; 101 | } 102 | 103 | void initCursor(){ 104 | this.cursor = 0; 105 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 106 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 107 | } 108 | 109 | boolean moveCursor(){ 110 | if(this.cursor < this.available - 1){ 111 | this.cursor++; 112 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 113 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 114 | return true; 115 | }else{ 116 | return false; 117 | } 118 | } 119 | 120 | void lockBuffer(String segmenterName){ 121 | this.buffLocker.add(segmenterName); 122 | } 123 | 124 | void unlockBuffer(String segmenterName){ 125 | this.buffLocker.remove(segmenterName); 126 | } 127 | 128 | boolean isBufferLocked(){ 129 | return this.buffLocker.size() > 0; 130 | } 131 | 132 | boolean isBufferConsumed(){ 133 | return this.cursor == this.available - 1; 134 | } 135 | 136 | boolean needRefillBuffer(){ 137 | return this.available == BUFF_SIZE 138 | && this.cursor < this.available - 1 139 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL 140 | && !this.isBufferLocked(); 141 | } 142 | 143 | void markBufferOffset(){ 144 | this.buffOffset += this.cursor; 145 | } 146 | 147 | void addLexeme(Lexeme lexeme){ 148 | this.orgLexemes.addLexeme(lexeme); 149 | } 150 | 151 | void addLexemePath(LexemePath path){ 152 | if(path != null){ 153 | this.pathMap.put(path.getPathBegin(), path); 154 | } 155 | } 156 | 157 | 158 | QuickSortSet getOrgLexemes(){ 159 | return this.orgLexemes; 160 | } 161 | 162 | void processUnkownCJKChar(){ 163 | int index = 0; 164 | for( ; index < this.available ;){ 165 | //跳过标点符号等字符 166 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){ 167 | index++; 168 | continue; 
169 | } 170 | //从pathMap找出对应index位置的LexemePath 171 | LexemePath path = this.pathMap.get(index); 172 | if(path != null){ 173 | //输出LexemePath中的lexeme到results集合 174 | Lexeme l = path.pollFirst(); 175 | while(l != null){ 176 | this.results.add(l); 177 | //将index移至lexeme后 178 | index = l.getBegin() + l.getLength(); 179 | l = path.pollFirst(); 180 | if(l != null){ 181 | //输出path内部,词元间遗漏的单字 182 | for(;index < l.getBegin();index++){ 183 | this.outputSingleCJK(index); 184 | } 185 | } 186 | } 187 | }else{//pathMap中找不到index对应的LexemePath 188 | //单字输出 189 | this.outputSingleCJK(index); 190 | index++; 191 | } 192 | } 193 | //清空当前的Map 194 | this.pathMap.clear(); 195 | } 196 | 197 | private void outputSingleCJK(int index){ 198 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){ 199 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR); 200 | this.results.add(singleCharLexeme); 201 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){ 202 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK); 203 | this.results.add(singleCharLexeme); 204 | } 205 | } 206 | 207 | boolean hasNextResult(){ 208 | return !this.results.isEmpty(); 209 | } 210 | 211 | Lexeme getNextLexeme(){ 212 | //从结果集取出,并移除第一个Lexme 213 | Lexeme result = this.results.pollFirst(); 214 | while(result != null){ 215 | //数量词合并 216 | this.compound(result); 217 | if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){ 218 | //是停止词继续取列表的下一个 219 | result = this.results.pollFirst(); 220 | }else{ 221 | //不是停止词, 生成lexeme的词元文本,输出 222 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength())); 223 | break; 224 | } 225 | } 226 | return result; 227 | } 228 | 229 | void reset(){ 230 | this.buffLocker.clear(); 231 | this.orgLexemes = new QuickSortSet(); 232 | this.available =0; 233 | this.buffOffset = 0; 234 | this.charTypes = new int[BUFF_SIZE]; 235 | this.cursor = 0; 236 | this.results.clear(); 237 | this.segmentBuff = new char[BUFF_SIZE]; 238 | this.pathMap.clear(); 239 | } 240 | 241 | private void compound(Lexeme result){ 242 | if(!this.cfg.isSmartMode()){ 243 | return ; 244 | } 245 | //数量词合并处理 246 | if(!this.results.isEmpty()){ 247 | 248 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){ 249 | Lexeme nextLexeme = this.results.peekFirst(); 250 | boolean appendOk = false; 251 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){ 252 | //合并英文数词+中文数词 253 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); 254 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 255 | //合并英文数词+中文量词 256 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 257 | } 258 | if(appendOk){ 259 | //弹出 260 | this.results.pollFirst(); 261 | } 262 | } 263 | 264 | //可能存在第二轮合并 265 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){ 266 | Lexeme nextLexeme = this.results.peekFirst(); 267 | boolean appendOk = false; 268 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 269 | //合并中文数词+中文量词 270 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 271 | } 272 | if(appendOk){ 273 | //弹出 274 | this.results.pollFirst(); 275 | } 276 | } 277 | 278 | } 279 | } 280 | 281 | } 282 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import 
java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import org.wltea.analyzer.dic.Dictionary; 7 | import org.wltea.analyzer.dic.Hit; 8 | 9 | 10 | class CJKSegmenter implements ISegmenter { 11 | 12 | //子分词器标签 13 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 14 | //待处理的分词hit队列 15 | private List tmpHits; 16 | 17 | 18 | CJKSegmenter(){ 19 | this.tmpHits = new LinkedList(); 20 | } 21 | 22 | /* (non-Javadoc) 23 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 24 | */ 25 | public void analyze(AnalyzeContext context) { 26 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 27 | 28 | //优先处理tmpHits中的hit 29 | if(!this.tmpHits.isEmpty()){ 30 | //处理词段队列 31 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 32 | for(Hit hit : tmpArray){ 33 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 34 | if(hit.isMatch()){ 35 | //输出当前的词 36 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 37 | context.addLexeme(newLexeme); 38 | 39 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 40 | this.tmpHits.remove(hit); 41 | } 42 | 43 | }else if(hit.isUnmatch()){ 44 | //hit不是词,移除 45 | this.tmpHits.remove(hit); 46 | } 47 | } 48 | } 49 | 50 | //********************************* 51 | //再对当前指针位置的字符进行单字匹配 52 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); 53 | if(singleCharHit.isMatch()){//首字成词 54 | //输出当前的词 55 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 56 | context.addLexeme(newLexeme); 57 | 58 | //同时也是词前缀 59 | if(singleCharHit.isPrefix()){ 60 | //前缀匹配则放入hit列表 61 | this.tmpHits.add(singleCharHit); 62 | } 63 | }else if(singleCharHit.isPrefix()){//首字为词前缀 64 | //前缀匹配则放入hit列表 65 | this.tmpHits.add(singleCharHit); 66 | } 67 | 68 | 69 | }else{ 70 | //遇到CHAR_USELESS字符 71 | //清空队列 72 | this.tmpHits.clear(); 73 | } 74 | 75 | //判断缓冲区是否已经读完 76 | if(context.isBufferConsumed()){ 77 | //清空队列 78 | this.tmpHits.clear(); 79 | } 80 | 81 | //判断是否锁定缓冲区 82 | if(this.tmpHits.size() == 0){ 83 | context.unlockBuffer(SEGMENTER_NAME); 84 | 85 | }else{ 86 | context.lockBuffer(SEGMENTER_NAME); 87 | } 88 | } 89 | 90 | /* (non-Javadoc) 91 | * @see org.wltea.analyzer.core.ISegmenter#reset() 92 | */ 93 | public void reset() { 94 | //清空队列 95 | this.tmpHits.clear(); 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import java.util.HashSet; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Set; 7 | 8 | import org.wltea.analyzer.dic.Dictionary; 9 | import org.wltea.analyzer.dic.Hit; 10 | 11 | class CN_QuantifierSegmenter implements ISegmenter{ 12 | 13 | //子分词器标签 14 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 15 | 16 | //中文数词 17 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum 18 | private static Set ChnNumberChars = new HashSet(); 19 | static{ 20 | char[] ca = Chn_Num.toCharArray(); 21 | for(char nChar : ca){ 22 | ChnNumberChars.add(nChar); 23 | } 24 | } 25 | 26 | /* 27 | * 词元的开始位置, 28 | * 同时作为子分词器状态标识 29 | * 当start > -1 时,标识当前的分词器正在处理字符 30 | */ 31 | private int nStart; 32 | /* 
33 | * 记录词元结束位置 34 | * end记录的是在词元中最后一个出现的合理的数词结束 35 | */ 36 | private int nEnd; 37 | 38 | //待处理的量词hit队列 39 | private List countHits; 40 | 41 | 42 | CN_QuantifierSegmenter(){ 43 | nStart = -1; 44 | nEnd = -1; 45 | this.countHits = new LinkedList(); 46 | } 47 | 48 | public void analyze(AnalyzeContext context) { 49 | //处理中文数词 50 | this.processCNumber(context); 51 | //处理中文量词 52 | this.processCount(context); 53 | 54 | //判断是否锁定缓冲区 55 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){ 56 | //对缓冲区解锁 57 | context.unlockBuffer(SEGMENTER_NAME); 58 | }else{ 59 | context.lockBuffer(SEGMENTER_NAME); 60 | } 61 | } 62 | 63 | 64 | public void reset() { 65 | nStart = -1; 66 | nEnd = -1; 67 | countHits.clear(); 68 | } 69 | 70 | private void processCNumber(AnalyzeContext context){ 71 | if(nStart == -1 && nEnd == -1){//初始状态 72 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 73 | && ChnNumberChars.contains(context.getCurrentChar())){ 74 | //记录数词的起始、结束位置 75 | nStart = context.getCursor(); 76 | nEnd = context.getCursor(); 77 | } 78 | }else{//正在处理状态 79 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 80 | && ChnNumberChars.contains(context.getCurrentChar())){ 81 | //记录数词的结束位置 82 | nEnd = context.getCursor(); 83 | }else{ 84 | //输出数词 85 | this.outputNumLexeme(context); 86 | //重置头尾指针 87 | nStart = -1; 88 | nEnd = -1; 89 | } 90 | } 91 | 92 | //缓冲区已经用完,还有尚未输出的数词 93 | if(context.isBufferConsumed()){ 94 | if(nStart != -1 && nEnd != -1){ 95 | //输出数词 96 | outputNumLexeme(context); 97 | //重置头尾指针 98 | nStart = -1; 99 | nEnd = -1; 100 | } 101 | } 102 | } 103 | 104 | private void processCount(AnalyzeContext context){ 105 | // 判断是否需要启动量词扫描 106 | if(!this.needCountScan(context)){ 107 | return; 108 | } 109 | 110 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){ 111 | 112 | //优先处理countHits中的hit 113 | if(!this.countHits.isEmpty()){ 114 | //处理词段队列 115 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); 116 | for(Hit hit : tmpArray){ 117 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 118 | if(hit.isMatch()){ 119 | //输出当前的词 120 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); 121 | context.addLexeme(newLexeme); 122 | 123 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 124 | this.countHits.remove(hit); 125 | } 126 | 127 | }else if(hit.isUnmatch()){ 128 | //hit不是词,移除 129 | this.countHits.remove(hit); 130 | } 131 | } 132 | } 133 | 134 | //********************************* 135 | //对当前指针位置的字符进行单字匹配 136 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); 137 | if(singleCharHit.isMatch()){//首字成量词词 138 | //输出当前的词 139 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); 140 | context.addLexeme(newLexeme); 141 | 142 | //同时也是词前缀 143 | if(singleCharHit.isPrefix()){ 144 | //前缀匹配则放入hit列表 145 | this.countHits.add(singleCharHit); 146 | } 147 | }else if(singleCharHit.isPrefix()){//首字为量词前缀 148 | //前缀匹配则放入hit列表 149 | this.countHits.add(singleCharHit); 150 | } 151 | 152 | 153 | }else{ 154 | //输入的不是中文字符 155 | //清空未成形的量词 156 | this.countHits.clear(); 157 | } 158 | 159 | //缓冲区数据已经读完,还有尚未输出的量词 160 | if(context.isBufferConsumed()){ 161 | //清空未成形的量词 162 | this.countHits.clear(); 163 | } 164 | } 165 | 166 | private boolean needCountScan(AnalyzeContext context){ 167 | if((nStart != -1 && nEnd != -1 ) || 
!countHits.isEmpty()){ 168 | //正在处理中文数词,或者正在处理量词 169 | return true; 170 | }else{ 171 | //找到一个相邻的数词 172 | if(!context.getOrgLexemes().isEmpty()){ 173 | Lexeme l = context.getOrgLexemes().peekLast(); 174 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){ 175 | if(l.getBegin() + l.getLength() == context.getCursor()){ 176 | return true; 177 | } 178 | } 179 | } 180 | } 181 | return false; 182 | } 183 | 184 | private void outputNumLexeme(AnalyzeContext context){ 185 | if(nStart > -1 && nEnd > -1){ 186 | //输出数词 187 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM); 188 | context.addLexeme(newLexeme); 189 | 190 | } 191 | } 192 | 193 | } 194 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | class CharacterUtil { 4 | 5 | public static final int CHAR_USELESS = 0; 6 | 7 | public static final int CHAR_ARABIC = 0X00000001; 8 | 9 | public static final int CHAR_ENGLISH = 0X00000002; 10 | 11 | public static final int CHAR_CHINESE = 0X00000004; 12 | 13 | public static final int CHAR_OTHER_CJK = 0X00000008; 14 | 15 | 16 | static int identifyCharType(char input){ 17 | if(input >= '0' && input <= '9'){ 18 | return CHAR_ARABIC; 19 | 20 | }else if((input >= 'a' && input <= 'z') 21 | || (input >= 'A' && input <= 'Z')){ 22 | return CHAR_ENGLISH; 23 | 24 | }else { 25 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 26 | 27 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 28 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 29 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 30 | //目前已知的中文字符UTF-8集合 31 | return CHAR_CHINESE; 32 | 33 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 34 | //韩文字符集 35 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 36 | || ub == Character.UnicodeBlock.HANGUL_JAMO 37 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 38 | //日文字符集 39 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 40 | || ub == Character.UnicodeBlock.KATAKANA //片假名 41 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 42 | return CHAR_OTHER_CJK; 43 | 44 | } 45 | } 46 | //其他的不做处理的字符 47 | return CHAR_USELESS; 48 | } 49 | 50 | /** 51 | * 进行字符规格化(全角转半角,大写转小写处理) 52 | * @param input 53 | * @return char 54 | */ 55 | static char regularize(char input){ 56 | if (input == 12288) { 57 | input = (char) 32; 58 | 59 | }else if (input > 65280 && input < 65375) { 60 | input = (char) (input - 65248); 61 | 62 | }else if (input >= 'A' && input <= 'Z') { 63 | input += 32; 64 | } 65 | 66 | return input; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 
8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | void process(AnalyzeContext context , boolean useSmart){ 40 | QuickSortSet orgLexemes = context.getOrgLexemes(); 41 | Lexeme orgLexeme = orgLexemes.pollFirst(); 42 | 43 | LexemePath crossPath = new LexemePath(); 44 | while(orgLexeme != null){ 45 | if(!crossPath.addCrossLexeme(orgLexeme)){ 46 | //找到与crossPath不相交的下一个crossPath 47 | if(crossPath.size() == 1 || !useSmart){ 48 | //crossPath没有歧义 或者 不做歧义处理 49 | //直接输出当前crossPath 50 | context.addLexemePath(crossPath); 51 | }else{ 52 | //对当前的crossPath进行歧义处理 53 | QuickSortSet.Cell headCell = crossPath.getHead(); 54 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 55 | //输出歧义处理结果judgeResult 56 | context.addLexemePath(judgeResult); 57 | } 58 | 59 | //把orgLexeme加入新的crossPath中 60 | crossPath = new LexemePath(); 61 | crossPath.addCrossLexeme(orgLexeme); 62 | } 63 | orgLexeme = orgLexemes.pollFirst(); 64 | } 65 | 66 | 67 | //处理最后的path 68 | if(crossPath.size() == 1 || !useSmart){ 69 | //crossPath没有歧义 或者 不做歧义处理 70 | //直接输出当前crossPath 71 | context.addLexemePath(crossPath); 72 | }else{ 73 | //对当前的crossPath进行歧义处理 74 | QuickSortSet.Cell headCell = crossPath.getHead(); 75 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 76 | //输出歧义处理结果judgeResult 77 | context.addLexemePath(judgeResult); 78 | } 79 | } 80 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 81 | //候选路径集合 82 | TreeSet pathOptions = new TreeSet(); 83 | //候选结果路径 84 | LexemePath option = new LexemePath(); 85 | 86 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 87 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 88 | 89 | //当前词元链并非最理想的,加入候选路径集合 90 | pathOptions.add(option.copy()); 91 | 92 | //存在歧义词,处理 93 | QuickSortSet.Cell c = null; 94 | while(!lexemeStack.isEmpty()){ 95 | c = lexemeStack.pop(); 96 | //回滚词元链 97 | this.backPath(c.getLexeme() , option); 98 | //从歧义词位置开始,递归,生成可选方案 99 | this.forwardPath(c , option); 100 | pathOptions.add(option.copy()); 101 | } 102 | 103 | //返回集合中的最优方案 104 | return pathOptions.first(); 105 | 106 | } 107 | 108 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 109 | //发生冲突的Lexeme栈 110 | Stack conflictStack = new Stack(); 111 | QuickSortSet.Cell c = lexemeCell; 112 | //迭代遍历Lexeme链表 113 | while(c != null && c.getLexeme() != null){ 114 | if(!option.addNotCrossLexeme(c.getLexeme())){ 115 | //词元交叉,添加失败则加入lexemeStack栈 116 | conflictStack.push(c); 117 | } 118 | c = c.getNext(); 119 | } 120 | return conflictStack; 121 | } 122 | 123 | private void backPath(Lexeme l , LexemePath option){ 124 | 
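// Roll back the candidate path: while the conflicting lexeme l still
// crosses the tail of the path, keep removing tail lexemes so that l can
// be re-added from a non-crossing state by the next forwardPath pass.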
while(option.checkCross(l)){ 125 | option.removeTail(); 126 | } 127 | 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 27 | import org.wltea.analyzer.dic.Dictionary; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | import java.util.ArrayList; 32 | import java.util.List; 33 | 34 | /** 35 | * IK分词器主类 36 | */ 37 | public final class IKSegmenter { 38 | 39 | //字符窜reader 40 | private Reader input; 41 | //分词器配置项 42 | private DictionaryConfiguration cfg; 43 | //分词器上下文 44 | private AnalyzeContext context; 45 | //分词处理器列表 46 | private List segmenters; 47 | //分词歧义裁决器 48 | private IKArbitrator arbitrator; 49 | 50 | 51 | public IKSegmenter(Reader input, DictionaryConfiguration cfg) { 52 | this.input = input; 53 | this.cfg = cfg; 54 | 55 | //初始化词典单例 56 | Dictionary.initial(this.cfg); 57 | //初始化分词上下文 58 | this.context = new AnalyzeContext(this.cfg); 59 | //加载子分词器 60 | this.segmenters = this.loadSegmenters(); 61 | //加载歧义裁决器 62 | this.arbitrator = new IKArbitrator(); 63 | } 64 | 65 | 66 | private List loadSegmenters() { 67 | List segmenters = new ArrayList(4); 68 | //处理字母的子分词器 69 | segmenters.add(new LetterSegmenter()); 70 | //处理中文数量词的子分词器 71 | segmenters.add(new CN_QuantifierSegmenter()); 72 | //处理中文词的子分词器 73 | segmenters.add(new CJKSegmenter()); 74 | return segmenters; 75 | } 76 | 77 | public synchronized Lexeme next() throws IOException { 78 | if (this.context.hasNextResult()) { 79 | //存在尚未输出的分词结果 80 | return this.context.getNextLexeme(); 81 | } else { 82 | /* 83 | * 从reader中读取数据,填充buffer 84 | * 如果reader是分次读入buffer的,那么buffer要进行移位处理 85 | * 移位处理上次读入的但未处理的数据 86 | */ 87 | int available = context.fillBuffer(this.input); 88 | if (available <= 0) { 89 | //reader已经读完 90 | context.reset(); 91 | return null; 92 | 93 | } else { 94 | //初始化指针 95 | context.initCursor(); 96 | do { 97 | //遍历子分词器 98 | for (ISegmenter segmenter : segmenters) { 99 | segmenter.analyze(context); 100 | } 101 | //字符缓冲区接近读完,需要读入新的字符 102 | if (context.needRefillBuffer()) { 103 | break; 104 | } 105 | //向前移动指针 106 | } while (context.moveCursor()); 107 | //重置子分词器,为下轮循环进行初始化 108 | for (ISegmenter segmenter : segmenters) { 109 | segmenter.reset(); 110 | } 111 | } 112 | 
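// At this point the raw, possibly overlapping lexemes emitted by the
// sub-segmenters are buffered in context.orgLexemes; the arbitrator below
// resolves the overlaps and stores the winning LexemePath per position.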
//对分词进行歧义处理 113 | this.arbitrator.process(context, this.cfg.isSmartMode()); 114 | //处理未切分CJK字符 115 | context.processUnkownCJKChar(); 116 | //记录本次分词的缓冲区位移 117 | context.markBufferOffset(); 118 | //输出词元 119 | if (this.context.hasNextResult()) { 120 | return this.context.getNextLexeme(); 121 | } 122 | return null; 123 | } 124 | } 125 | 126 | public synchronized void reset(Reader input) { 127 | this.input = input; 128 | context.reset(); 129 | for (ISegmenter segmenter : segmenters) { 130 | segmenter.reset(); 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * 30 | * 子分词器接口 31 | */ 32 | interface ISegmenter { 33 | 34 | void analyze(AnalyzeContext context); 35 | 36 | 37 | /** 38 | * 重置子分析器状态 39 | */ 40 | void reset(); 41 | 42 | } 43 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import java.util.Arrays; 4 | 5 | class LetterSegmenter implements ISegmenter { 6 | 7 | //子分词器标签 8 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; 9 | //链接符号 10 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' 
, '@' , '_'}; 11 | 12 | //数字符号 13 | private static final char[] Num_Connector = new char[]{',' , '.'}; 14 | 15 | /* 16 | * 词元的开始位置, 17 | * 同时作为子分词器状态标识 18 | * 当start > -1 时,标识当前的分词器正在处理字符 19 | */ 20 | private int start; 21 | /* 22 | * 记录词元结束位置 23 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 24 | */ 25 | private int end; 26 | 27 | /* 28 | * 字母起始位置 29 | */ 30 | private int englishStart; 31 | 32 | /* 33 | * 字母结束位置 34 | */ 35 | private int englishEnd; 36 | 37 | /* 38 | * 阿拉伯数字起始位置 39 | */ 40 | private int arabicStart; 41 | 42 | /* 43 | * 阿拉伯数字结束位置 44 | */ 45 | private int arabicEnd; 46 | 47 | LetterSegmenter(){ 48 | Arrays.sort(Letter_Connector); 49 | Arrays.sort(Num_Connector); 50 | this.start = -1; 51 | this.end = -1; 52 | this.englishStart = -1; 53 | this.englishEnd = -1; 54 | this.arabicStart = -1; 55 | this.arabicEnd = -1; 56 | } 57 | 58 | 59 | /* (non-Javadoc) 60 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 61 | */ 62 | public void analyze(AnalyzeContext context) { 63 | boolean bufferLockFlag = false; 64 | //处理英文字母 65 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 66 | //处理阿拉伯字母 67 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 68 | //处理混合字母 69 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 70 | 71 | //判断是否锁定缓冲区 72 | if(bufferLockFlag){ 73 | context.lockBuffer(SEGMENTER_NAME); 74 | }else{ 75 | //对缓冲区解锁 76 | context.unlockBuffer(SEGMENTER_NAME); 77 | } 78 | } 79 | 80 | /* (non-Javadoc) 81 | * @see org.wltea.analyzer.core.ISegmenter#reset() 82 | */ 83 | public void reset() { 84 | this.start = -1; 85 | this.end = -1; 86 | this.englishStart = -1; 87 | this.englishEnd = -1; 88 | this.arabicStart = -1; 89 | this.arabicEnd = -1; 90 | } 91 | 92 | private boolean processMixLetter(AnalyzeContext context){ 93 | boolean needLock = false; 94 | 95 | if(this.start == -1){//当前的分词器尚未开始处理字符 96 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 97 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 98 | //记录起始指针的位置,标明分词器进入处理状态 99 | this.start = context.getCursor(); 100 | this.end = start; 101 | } 102 | 103 | }else{//当前的分词器正在处理字符 104 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 105 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 106 | //记录下可能的结束位置 107 | this.end = context.getCursor(); 108 | 109 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 110 | && this.isLetterConnector(context.getCurrentChar())){ 111 | //记录下可能的结束位置 112 | this.end = context.getCursor(); 113 | }else{ 114 | //遇到非Letter字符,输出词元 115 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 116 | context.addLexeme(newLexeme); 117 | this.start = -1; 118 | this.end = -1; 119 | } 120 | } 121 | 122 | //判断缓冲区是否已经读完 123 | if(context.isBufferConsumed()){ 124 | if(this.start != -1 && this.end != -1){ 125 | //缓冲以读完,输出词元 126 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 127 | context.addLexeme(newLexeme); 128 | this.start = -1; 129 | this.end = -1; 130 | } 131 | } 132 | 133 | //判断是否锁定缓冲区 134 | if(this.start == -1 && this.end == -1){ 135 | //对缓冲区解锁 136 | needLock = false; 137 | }else{ 138 | needLock = true; 139 | } 140 | return needLock; 141 | } 142 | 143 | private boolean processEnglishLetter(AnalyzeContext context){ 144 | boolean needLock = false; 145 | 146 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符 147 
| if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 148 | //记录起始指针的位置,标明分词器进入处理状态 149 | this.englishStart = context.getCursor(); 150 | this.englishEnd = this.englishStart; 151 | } 152 | }else {//当前的分词器正在处理英文字符 153 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 154 | //记录当前指针位置为结束位置 155 | this.englishEnd = context.getCursor(); 156 | }else{ 157 | //遇到非English字符,输出词元 158 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 159 | context.addLexeme(newLexeme); 160 | this.englishStart = -1; 161 | this.englishEnd= -1; 162 | } 163 | } 164 | 165 | //判断缓冲区是否已经读完 166 | if(context.isBufferConsumed()){ 167 | if(this.englishStart != -1 && this.englishEnd != -1){ 168 | //缓冲以读完,输出词元 169 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 170 | context.addLexeme(newLexeme); 171 | this.englishStart = -1; 172 | this.englishEnd= -1; 173 | } 174 | } 175 | 176 | //判断是否锁定缓冲区 177 | if(this.englishStart == -1 && this.englishEnd == -1){ 178 | //对缓冲区解锁 179 | needLock = false; 180 | }else{ 181 | needLock = true; 182 | } 183 | return needLock; 184 | } 185 | 186 | private boolean processArabicLetter(AnalyzeContext context){ 187 | boolean needLock = false; 188 | 189 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符 190 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 191 | //记录起始指针的位置,标明分词器进入处理状态 192 | this.arabicStart = context.getCursor(); 193 | this.arabicEnd = this.arabicStart; 194 | } 195 | }else {//当前的分词器正在处理数字字符 196 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 197 | //记录当前指针位置为结束位置 198 | this.arabicEnd = context.getCursor(); 199 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 200 | && this.isNumConnector(context.getCurrentChar())){ 201 | //不输出数字,但不标记结束 202 | }else{ 203 | ////遇到非Arabic字符,输出词元 204 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 205 | context.addLexeme(newLexeme); 206 | this.arabicStart = -1; 207 | this.arabicEnd = -1; 208 | } 209 | } 210 | 211 | //判断缓冲区是否已经读完 212 | if(context.isBufferConsumed()){ 213 | if(this.arabicStart != -1 && this.arabicEnd != -1){ 214 | //生成已切分的词元 215 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 216 | context.addLexeme(newLexeme); 217 | this.arabicStart = -1; 218 | this.arabicEnd = -1; 219 | } 220 | } 221 | 222 | //判断是否锁定缓冲区 223 | if(this.arabicStart == -1 && this.arabicEnd == -1){ 224 | //对缓冲区解锁 225 | needLock = false; 226 | }else{ 227 | needLock = true; 228 | } 229 | return needLock; 230 | } 231 | 232 | private boolean isLetterConnector(char input){ 233 | int index = Arrays.binarySearch(Letter_Connector, input); 234 | return index >= 0; 235 | } 236 | 237 | private boolean isNumConnector(char input){ 238 | int index = Arrays.binarySearch(Num_Connector, input); 239 | return index >= 0; 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | public class Lexeme implements Comparable { 4 | //lexemeType常量 5 | //未知 6 | public static final int TYPE_UNKNOWN = 0; 7 | //英文 8 | public static final 
int TYPE_ENGLISH = 1;
9 | //数字
10 | public static final int TYPE_ARABIC = 2;
11 | //英文数字混合
12 | public static final int TYPE_LETTER = 3;
13 | //中文词元
14 | public static final int TYPE_CNWORD = 4;
15 | //中文单字
16 | public static final int TYPE_CNCHAR = 64;
17 | //日韩文字
18 | public static final int TYPE_OTHER_CJK = 8;
19 | //中文数词
20 | public static final int TYPE_CNUM = 16;
21 | //中文量词
22 | public static final int TYPE_COUNT = 32;
23 | //中文数量词
24 | public static final int TYPE_CQUAN = 48;
25 | 
26 | //词元的起始位移
27 | private int offset;
28 | //词元的相对起始位置
29 | private int begin;
30 | //词元的长度
31 | private int length;
32 | //词元文本
33 | private String lexemeText;
34 | //词元类型
35 | private int lexemeType;
36 | 
37 | 
38 | public Lexeme(int offset, int begin, int length, int lexemeType) {
39 | this.offset = offset;
40 | this.begin = begin;
41 | if (length < 0) {
42 | throw new IllegalArgumentException("length < 0");
43 | }
44 | this.length = length;
45 | this.lexemeType = lexemeType;
46 | }
47 | 
48 | /*
49 | * 判断词元相等算法
50 | * 起始位置偏移、起始位置、终止位置相同
51 | * @see java.lang.Object#equals(Object o)
52 | */
53 | public boolean equals(Object o) {
54 | if (o == null) {
55 | return false;
56 | }
57 | 
58 | if (this == o) {
59 | return true;
60 | }
61 | 
62 | if (o instanceof Lexeme) {
63 | Lexeme other = (Lexeme) o;
64 | if (this.offset == other.getOffset()
65 | && this.begin == other.getBegin()
66 | && this.length == other.getLength()) {
67 | return true;
68 | } else {
69 | return false;
70 | }
71 | } else {
72 | return false;
73 | }
74 | }
75 | 
76 | /*
77 | * 词元哈希编码算法
78 | * @see java.lang.Object#hashCode()
79 | */
80 | public int hashCode() {
81 | int absBegin = getBeginPosition();
82 | int absEnd = getEndPosition();
83 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
84 | }
85 | 
86 | /*
87 | * 词元在排序集合中的比较算法
88 | * @see java.lang.Comparable#compareTo(java.lang.Object)
89 | */
90 | public int compareTo(Lexeme other) {
91 | //起始位置优先
92 | if (this.begin < other.getBegin()) {
93 | return -1;
94 | } else if (this.begin == other.getBegin()) {
95 | //词元长度优先
96 | if (this.length > other.getLength()) {
97 | return -1;
98 | } else if (this.length == other.getLength()) {
99 | return 0;
100 | } else {//this.length < other.getLength()
101 | return 1;
102 | }
103 | 
104 | } else {//this.begin > other.getBegin()
105 | return 1;
106 | }
107 | }
108 | 
109 | public int getOffset() {
110 | return offset;
111 | }
112 | 
113 | public void setOffset(int offset) {
114 | this.offset = offset;
115 | }
116 | 
117 | public int getBegin() {
118 | return begin;
119 | }
120 | 
121 | public int getBeginPosition() {
122 | return offset + begin;
123 | }
124 | 
125 | public void setBegin(int begin) {
126 | this.begin = begin;
127 | }
128 | 
129 | public int getEndPosition() {
130 | return offset + begin + length;
131 | }
132 | 
133 | public int getLength() {
134 | return this.length;
135 | }
136 | 
137 | public void setLength(int length) {
138 | if (length < 0) {
139 | throw new IllegalArgumentException("length < 0");
140 | }
141 | this.length = length;
142 | }
143 | 
144 | public String getLexemeText() {
145 | if (lexemeText == null) {
146 | return "";
147 | }
148 | return lexemeText;
149 | }
150 | 
151 | public void setLexemeText(String lexemeText) {
152 | if (lexemeText == null) {
153 | this.lexemeText = "";
154 | this.length = 0;
155 | } else {
156 | this.lexemeText = lexemeText;
157 | this.length = lexemeText.length();
158 | }
159 | }
160 | 
161 | public int getLexemeType() {
162 | return lexemeType;
163 | }
164 | 
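/**
 * Returns a human-readable label for the lexeme type; used by toString()
 * when dumping segmentation results.
 */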
165 | public String getLexemeTypeString() {
166 | switch (lexemeType) {
167 | 
168 | case TYPE_ENGLISH:
169 | return "ENGLISH";
170 | 
171 | case TYPE_ARABIC:
172 | return "ARABIC";
173 | 
174 | case TYPE_LETTER:
175 | return "LETTER";
176 | 
177 | case TYPE_CNWORD:
178 | return "CN_WORD";
179 | 
180 | case TYPE_CNCHAR:
181 | return "CN_CHAR";
182 | 
183 | case TYPE_OTHER_CJK:
184 | return "OTHER_CJK";
185 | 
186 | case TYPE_COUNT:
187 | return "COUNT";
188 | 
189 | case TYPE_CNUM:
190 | return "TYPE_CNUM";
191 | 
192 | case TYPE_CQUAN:
193 | return "TYPE_CQUAN";
194 | 
195 | default:
196 | return "UNKNOWN";
197 | }
198 | }
199 | 
200 | 
201 | public void setLexemeType(int lexemeType) {
202 | this.lexemeType = lexemeType;
203 | }
204 | 
205 | public boolean append(Lexeme l, int lexemeType) {
206 | if (l != null && this.getEndPosition() == l.getBeginPosition()) {
207 | this.length += l.getLength();
208 | this.lexemeType = lexemeType;
209 | return true;
210 | } else {
211 | return false;
212 | }
213 | }
214 | 
215 | 
216 | public String toString() {
217 | StringBuffer strbuf = new StringBuffer();
218 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
219 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
220 | strbuf.append(this.getLexemeTypeString());
221 | return strbuf.toString();
222 | }
223 | 
224 | 
225 | }
226 | 
--------------------------------------------------------------------------------
/ik-analysis-core/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.core;
2 | 
3 | 
4 | class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
5 | 
6 | //起始位置
7 | private int pathBegin;
8 | //结束
9 | private int pathEnd;
10 | //词元链的有效字符长度
11 | private int payloadLength;
12 | 
13 | LexemePath() {
14 | this.pathBegin = -1;
15 | this.pathEnd = -1;
16 | this.payloadLength = 0;
17 | }
18 | 
19 | boolean addCrossLexeme(Lexeme lexeme) {
20 | if (this.isEmpty()) {
21 | this.addLexeme(lexeme);
22 | this.pathBegin = lexeme.getBegin();
23 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
24 | this.payloadLength += lexeme.getLength();
25 | return true;
26 | 
27 | } else if (this.checkCross(lexeme)) {
28 | this.addLexeme(lexeme);
29 | if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
30 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
31 | }
32 | this.payloadLength = this.pathEnd - this.pathBegin;
33 | return true;
34 | 
35 | } else {
36 | return false;
37 | 
38 | }
39 | }
40 | 
41 | boolean addNotCrossLexeme(Lexeme lexeme) {
42 | if (this.isEmpty()) {
43 | this.addLexeme(lexeme);
44 | this.pathBegin = lexeme.getBegin();
45 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
46 | this.payloadLength += lexeme.getLength();
47 | return true;
48 | 
49 | } else if (this.checkCross(lexeme)) {
50 | return false;
51 | 
52 | } else {
53 | this.addLexeme(lexeme);
54 | this.payloadLength += lexeme.getLength();
55 | Lexeme head = this.peekFirst();
56 | this.pathBegin = head.getBegin();
57 | Lexeme tail = this.peekLast();
58 | this.pathEnd = tail.getBegin() + tail.getLength();
59 | return true;
60 | 
61 | }
62 | }
63 | 
64 | Lexeme removeTail() {
65 | Lexeme tail = this.pollLast();
66 | if (this.isEmpty()) {
67 | this.pathBegin = -1;
68 | this.pathEnd = -1;
69 | this.payloadLength = 0;
70 | } else {
71 | this.payloadLength -= tail.getLength();
72 | Lexeme newTail = this.peekLast();
73 | this.pathEnd = newTail.getBegin() + newTail.getLength();
74 | }
75 | return 
tail; 76 | } 77 | 78 | boolean checkCross(Lexeme lexeme) { 79 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 80 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin() + lexeme.getLength()); 81 | } 82 | 83 | int getPathBegin() { 84 | return pathBegin; 85 | } 86 | 87 | int getPathEnd() { 88 | return pathEnd; 89 | } 90 | 91 | int getPayloadLength() { 92 | return this.payloadLength; 93 | } 94 | 95 | int getPathLength() { 96 | return this.pathEnd - this.pathBegin; 97 | } 98 | 99 | 100 | int getXWeight() { 101 | int product = 1; 102 | Cell c = this.getHead(); 103 | while (c != null && c.getLexeme() != null) { 104 | product *= c.getLexeme().getLength(); 105 | c = c.getNext(); 106 | } 107 | return product; 108 | } 109 | 110 | int getPWeight() { 111 | int pWeight = 0; 112 | int p = 0; 113 | Cell c = this.getHead(); 114 | while (c != null && c.getLexeme() != null) { 115 | p++; 116 | pWeight += p * c.getLexeme().getLength(); 117 | c = c.getNext(); 118 | } 119 | return pWeight; 120 | } 121 | 122 | LexemePath copy() { 123 | LexemePath theCopy = new LexemePath(); 124 | theCopy.pathBegin = this.pathBegin; 125 | theCopy.pathEnd = this.pathEnd; 126 | theCopy.payloadLength = this.payloadLength; 127 | Cell c = this.getHead(); 128 | while (c != null && c.getLexeme() != null) { 129 | theCopy.addLexeme(c.getLexeme()); 130 | c = c.getNext(); 131 | } 132 | return theCopy; 133 | } 134 | 135 | public int compareTo(LexemePath o) { 136 | //比较有效文本长度 137 | if (this.payloadLength > o.payloadLength) { 138 | return -1; 139 | } else if (this.payloadLength < o.payloadLength) { 140 | return 1; 141 | } else { 142 | //比较词元个数,越少越好 143 | if (this.size() < o.size()) { 144 | return -1; 145 | } else if (this.size() > o.size()) { 146 | return 1; 147 | } else { 148 | //路径跨度越大越好 149 | if (this.getPathLength() > o.getPathLength()) { 150 | return -1; 151 | } else if (this.getPathLength() < o.getPathLength()) { 152 | return 1; 153 | } else { 154 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 155 | if (this.pathEnd > o.pathEnd) { 156 | return -1; 157 | } else if (pathEnd < o.pathEnd) { 158 | return 1; 159 | } else { 160 | //词长越平均越好 161 | if (this.getXWeight() > o.getXWeight()) { 162 | return -1; 163 | } else if (this.getXWeight() < o.getXWeight()) { 164 | return 1; 165 | } else { 166 | //词元位置权重比较 167 | if (this.getPWeight() > o.getPWeight()) { 168 | return -1; 169 | } else if (this.getPWeight() < o.getPWeight()) { 170 | return 1; 171 | } 172 | 173 | } 174 | } 175 | } 176 | } 177 | } 178 | return 0; 179 | } 180 | 181 | public String toString() { 182 | StringBuffer sb = new StringBuffer(); 183 | sb.append("pathBegin : ").append(pathBegin).append("\r\n"); 184 | sb.append("pathEnd : ").append(pathEnd).append("\r\n"); 185 | sb.append("payloadLength : ").append(payloadLength).append("\r\n"); 186 | Cell head = this.getHead(); 187 | while (head != null) { 188 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n"); 189 | head = head.getNext(); 190 | } 191 | return sb.toString(); 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/QuickSortSet.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | class QuickSortSet { 4 | //链表头 5 | private Cell head; 6 | //链表尾 7 | private Cell tail; 8 | //链表的实际大小 9 | private int size; 10 | 11 | QuickSortSet(){ 12 | this.size = 0; 13 | } 14 | 15 | boolean 
addLexeme(Lexeme lexeme){ 16 | Cell newCell = new Cell(lexeme); 17 | if(this.size == 0){ 18 | this.head = newCell; 19 | this.tail = newCell; 20 | this.size++; 21 | return true; 22 | 23 | }else{ 24 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合 25 | return false; 26 | 27 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部 28 | this.tail.next = newCell; 29 | newCell.prev = this.tail; 30 | this.tail = newCell; 31 | this.size++; 32 | return true; 33 | 34 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部 35 | this.head.prev = newCell; 36 | newCell.next = this.head; 37 | this.head = newCell; 38 | this.size++; 39 | return true; 40 | 41 | }else{ 42 | //从尾部上逆 43 | Cell index = this.tail; 44 | while(index != null && index.compareTo(newCell) > 0){ 45 | index = index.prev; 46 | } 47 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 48 | return false; 49 | 50 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 51 | newCell.prev = index; 52 | newCell.next = index.next; 53 | index.next.prev = newCell; 54 | index.next = newCell; 55 | this.size++; 56 | return true; 57 | } 58 | } 59 | } 60 | return false; 61 | } 62 | 63 | Lexeme peekFirst(){ 64 | if(this.head != null){ 65 | return this.head.lexeme; 66 | } 67 | return null; 68 | } 69 | 70 | Lexeme pollFirst(){ 71 | if(this.size == 1){ 72 | Lexeme first = this.head.lexeme; 73 | this.head = null; 74 | this.tail = null; 75 | this.size--; 76 | return first; 77 | }else if(this.size > 1){ 78 | Lexeme first = this.head.lexeme; 79 | this.head = this.head.next; 80 | this.size --; 81 | return first; 82 | }else{ 83 | return null; 84 | } 85 | } 86 | 87 | Lexeme peekLast(){ 88 | if(this.tail != null){ 89 | return this.tail.lexeme; 90 | } 91 | return null; 92 | } 93 | 94 | Lexeme pollLast(){ 95 | if(this.size == 1){ 96 | Lexeme last = this.head.lexeme; 97 | this.head = null; 98 | this.tail = null; 99 | this.size--; 100 | return last; 101 | 102 | }else if(this.size > 1){ 103 | Lexeme last = this.tail.lexeme; 104 | this.tail = this.tail.prev; 105 | this.size--; 106 | return last; 107 | 108 | }else{ 109 | return null; 110 | } 111 | } 112 | 113 | int size(){ 114 | return this.size; 115 | } 116 | 117 | boolean isEmpty(){ 118 | return this.size == 0; 119 | } 120 | 121 | Cell getHead(){ 122 | return this.head; 123 | } 124 | 125 | class Cell implements Comparable{ 126 | private Cell prev; 127 | private Cell next; 128 | private Lexeme lexeme; 129 | 130 | Cell(Lexeme lexeme){ 131 | if(lexeme == null){ 132 | throw new IllegalArgumentException("lexeme must not be null"); 133 | } 134 | this.lexeme = lexeme; 135 | } 136 | 137 | public int compareTo(Cell o) { 138 | return this.lexeme.compareTo(o.lexeme); 139 | } 140 | 141 | public Cell getPrev(){ 142 | return this.prev; 143 | } 144 | 145 | public Cell getNext(){ 146 | return this.next; 147 | } 148 | 149 | public Lexeme getLexeme(){ 150 | return this.lexeme; 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/dic/DictSegment.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.dic; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | class DictSegment implements Comparable{ 8 | 9 | //公用字典表,存储汉字 10 | private static final Map charMap = new HashMap(16 , 0.95f); 11 | //数组大小上限 12 | private static final int ARRAY_LENGTH_LIMIT = 3; 13 | 14 | 15 | //Map存储结构 16 | private Map childrenMap; 17 | 
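// Note: children are kept in a small sorted array first and are migrated to
// a Map once the fan-out exceeds ARRAY_LENGTH_LIMIT; see lookforSegment().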
//数组方式存储结构 18 | private DictSegment[] childrenArray; 19 | 20 | 21 | //当前节点上存储的字符 22 | private Character nodeChar; 23 | //当前节点存储的Segment数目 24 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 25 | private int storeSize = 0; 26 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 27 | private int nodeState = 0; 28 | 29 | 30 | DictSegment(Character nodeChar){ 31 | if(nodeChar == null){ 32 | throw new IllegalArgumentException("参数为空异常,字符不能为空"); 33 | } 34 | this.nodeChar = nodeChar; 35 | } 36 | 37 | Character getNodeChar() { 38 | return nodeChar; 39 | } 40 | 41 | /* 42 | * 判断是否有下一个节点 43 | */ 44 | boolean hasNextNode(){ 45 | return this.storeSize > 0; 46 | } 47 | 48 | Hit match(char[] charArray){ 49 | return this.match(charArray , 0 , charArray.length , null); 50 | } 51 | 52 | Hit match(char[] charArray , int begin , int length){ 53 | return this.match(charArray , begin , length , null); 54 | } 55 | 56 | Hit match(char[] charArray , int begin , int length , Hit searchHit){ 57 | 58 | if(searchHit == null){ 59 | //如果hit为空,新建 60 | searchHit= new Hit(); 61 | //设置hit的其实文本位置 62 | searchHit.setBegin(begin); 63 | }else{ 64 | //否则要将HIT状态重置 65 | searchHit.setUnmatch(); 66 | } 67 | //设置hit的当前处理位置 68 | searchHit.setEnd(begin); 69 | 70 | Character keyChar = new Character(charArray[begin]); 71 | DictSegment ds = null; 72 | 73 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题 74 | DictSegment[] segmentArray = this.childrenArray; 75 | Map segmentMap = this.childrenMap; 76 | 77 | //STEP1 在节点中查找keyChar对应的DictSegment 78 | if(segmentArray != null){ 79 | //在数组中查找 80 | DictSegment keySegment = new DictSegment(keyChar); 81 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment); 82 | if(position >= 0){ 83 | ds = segmentArray[position]; 84 | } 85 | 86 | }else if(segmentMap != null){ 87 | //在map中查找 88 | ds = segmentMap.get(keyChar); 89 | } 90 | 91 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 92 | if(ds != null){ 93 | if(length > 1){ 94 | //词未匹配完,继续往下搜索 95 | return ds.match(charArray, begin + 1 , length - 1 , searchHit); 96 | }else if (length == 1){ 97 | 98 | //搜索最后一个char 99 | if(ds.nodeState == 1){ 100 | //添加HIT状态为完全匹配 101 | searchHit.setMatch(); 102 | } 103 | if(ds.hasNextNode()){ 104 | //添加HIT状态为前缀匹配 105 | searchHit.setPrefix(); 106 | //记录当前位置的DictSegment 107 | searchHit.setMatchedDictSegment(ds); 108 | } 109 | return searchHit; 110 | } 111 | 112 | } 113 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 114 | return searchHit; 115 | } 116 | 117 | void fillSegment(char[] charArray){ 118 | this.fillSegment(charArray, 0 , charArray.length , 1); 119 | } 120 | 121 | void disableSegment(char[] charArray){ 122 | this.fillSegment(charArray, 0 , charArray.length , 0); 123 | } 124 | 125 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){ 126 | //获取字典表中的汉字对象 127 | Character beginChar = new Character(charArray[begin]); 128 | Character keyChar = charMap.get(beginChar); 129 | //字典中没有该字,则将其添加入字典 130 | if(keyChar == null){ 131 | charMap.put(beginChar, beginChar); 132 | keyChar = beginChar; 133 | } 134 | 135 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 136 | DictSegment ds = lookforSegment(keyChar , enabled); 137 | if(ds != null){ 138 | //JSONUtils 139 | if(length > 1){ 140 | //词元还没有完全加入词典树 141 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled); 142 | }else if (length == 1){ 143 | //已经是词元的最后一个char,设置当前节点状态为enabled, 144 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 145 | ds.nodeState = enabled; 146 | } 147 | } 148 | 149 | } 150 | 151 | private DictSegment 
lookforSegment(Character keyChar , int create){ 152 | 153 | DictSegment ds = null; 154 | 155 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){ 156 | //get the array container, creating it if it does not exist yet 157 | DictSegment[] segmentArray = getChildrenArray(); 158 | //search the array 159 | DictSegment keySegment = new DictSegment(keyChar); 160 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment); 161 | if(position >= 0){ 162 | ds = segmentArray[position]; 163 | } 164 | 165 | //no matching segment was found in the array 166 | if(ds == null && create == 1){ 167 | ds = keySegment; 168 | if(this.storeSize < ARRAY_LENGTH_LIMIT){ 169 | //the array still has room; keep using array storage 170 | segmentArray[this.storeSize] = ds; 171 | //segment count +1 172 | this.storeSize++; 173 | Arrays.sort(segmentArray , 0 , this.storeSize); 174 | 175 | }else{ 176 | //the array is full; switch to Map storage 177 | //get the Map container, creating it if it does not exist yet 178 | Map segmentMap = getChildrenMap(); 179 | //migrate the segments from the array into the Map 180 | migrate(segmentArray , segmentMap); 181 | //store the new segment 182 | segmentMap.put(keyChar, ds); 183 | //segment count +1; storeSize++ must run before the array reference is released, so that in the extreme case a reader never sees an empty array 184 | this.storeSize++; 185 | //release the current array reference 186 | this.childrenArray = null; 187 | } 188 | 189 | } 190 | 191 | }else{ 192 | //get the Map container, creating it if it does not exist yet 193 | Map segmentMap = getChildrenMap(); 194 | //search the Map 195 | ds = segmentMap.get(keyChar); 196 | if(ds == null && create == 1){ 197 | //construct a new segment 198 | ds = new DictSegment(keyChar); 199 | segmentMap.put(keyChar , ds); 200 | //segment count on this node +1 201 | this.storeSize ++; 202 | } 203 | } 204 | 205 | return ds; 206 | } 207 | 208 | 209 | private DictSegment[] getChildrenArray(){ 210 | if(this.childrenArray == null){ 211 | synchronized(this){ 212 | if(this.childrenArray == null){ 213 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT]; 214 | } 215 | } 216 | } 217 | return this.childrenArray; 218 | } 219 | 220 | private Map getChildrenMap(){ 221 | if(this.childrenMap == null){ 222 | synchronized(this){ 223 | if(this.childrenMap == null){ 224 | this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); 225 | } 226 | } 227 | } 228 | return this.childrenMap; 229 | } 230 | 231 | private void migrate(DictSegment[] segmentArray , Map segmentMap){ 232 | for(DictSegment segment : segmentArray){ 233 | if(segment != null){ 234 | segmentMap.put(segment.nodeChar, segment); 235 | } 236 | } 237 | } 238 | 239 | public int compareTo(DictSegment o) { 240 | //compare the chars stored on the two nodes 241 | return this.nodeChar.compareTo(o.nodeChar); 242 | } 243 | 244 | } 245 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK Chinese Word Segmenter, version 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com) 21 | * Copyright 2012, Oolong Studio 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.util.Collection; 29 | 30 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 31 | 32 | /** 33 | * Dictionary manager class, singleton pattern. 34 | */ 35 | public class Dictionary { 36 | 37 | private static Dictionary singleton; 38 | 39 | private DictSegment _MainDict; 40 | 41 | /* 42 | * Stop word dictionary 43 | */ 44 | private DictSegment _StopWordDict; 45 | /* 46 | * Quantifier dictionary 47 | */ 48 | private DictSegment _QuantifierDict; 49 | 50 | /** 51 | * Configuration object 52 | */ 53 | private DictionaryConfiguration cfg; 54 | 55 | private Dictionary(DictionaryConfiguration cfg){ 56 | this.cfg = cfg; 57 | this.loadMainDict(); 58 | this.loadStopWordDict(); 59 | this.loadQuantifierDict(); 60 | } 61 | 62 | public static Dictionary initial(DictionaryConfiguration cfg){ 63 | if(singleton == null){ 64 | synchronized(Dictionary.class){ 65 | if(singleton == null){ 66 | singleton = new Dictionary(cfg); 67 | return singleton; 68 | } 69 | } 70 | } 71 | return singleton; 72 | } 73 | 74 | public static Dictionary getSingleton(){ 75 | if(singleton == null){ 76 | throw new IllegalStateException("Dictionary has not been initialized yet; call the initial method first"); 77 | } 78 | return singleton; 79 | } 80 | 81 | public void addWords(Collection<String> words){ 82 | if(words != null){ 83 | for(String word : words){ 84 | if (word != null) { 85 | //batch-load the entries into the in-memory main dictionary 86 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); 87 | } 88 | } 89 | } 90 | } 91 | 92 | public void disableWords(Collection<String> words){ 93 | if(words != null){ 94 | for(String word : words){ 95 | if (word != null) { 96 | //batch-mask the entries in the main dictionary 97 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); 98 | } 99 | } 100 | } 101 | } 102 | 103 | public Hit matchInMainDict(char[] charArray){ 104 | return singleton._MainDict.match(charArray); 105 | } 106 | 107 | public Hit matchInMainDict(char[] charArray , int begin, int length){ 108 | return singleton._MainDict.match(charArray, begin, length); 109 | } 110 | 111 | public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ 112 | return singleton._QuantifierDict.match(charArray, begin, length); 113 | } 114 | 115 | 116 | public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){ 117 | DictSegment ds = matchedHit.getMatchedDictSegment(); 118 | return ds.match(charArray, currentIndex, 1 , matchedHit); 119 | } 120 | 121 | 122 | public boolean isStopWord(char[] charArray , int begin, int length){ 123 | return singleton._StopWordDict.match(charArray, begin, length).isMatch(); 124 | } 125 | 126 | /** 127 | * Loads the main dictionary and the extension dictionaries. 128 | */ 129 | private void loadMainDict() { 130 | //create the main dictionary instance 131 | _MainDict = new DictSegment((char) 0); 132 | for (char[] segment : cfg.getMainDictionary()) { 133 | _MainDict.fillSegment(segment); 134 | 135 | } 136 | } 137 | 138 | /** 139 | * Loads the user-extended stop word dictionary. 140 | */ 141 | private void loadStopWordDict(){ 142 | //create the stop word dictionary instance 143 | _StopWordDict = new DictSegment((char)0); 144 | for (char[] segment : cfg.getStopWordDictionary()) { 145 | _StopWordDict.fillSegment(segment); 146 | } 147 | 148 | } 149 | 150 | /** 151 | * Loads the quantifier dictionary. 152 | */ 153 | private void loadQuantifierDict(){ 154 | //create the quantifier dictionary instance 155 | _QuantifierDict = new DictSegment((char)0); 156 | for (char[] segment : cfg.getQuantifierDictionary()) { 157 | 
_QuantifierDict.fillSegment(segment); 158 | } 159 | 160 | } 161 | 162 | } 163 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK Chinese Word Segmenter, version 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com) 22 | * Copyright 2012, Oolong Studio 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /* 29 | * Represents one hit from a dictionary match. 30 | */ 31 | public class Hit { 32 | //the hit did not match 33 | private static final int UNMATCH = 0x00000000; 34 | //the hit is a full match 35 | private static final int MATCH = 0x00000001; 36 | //the hit is a prefix match 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //current state of this hit, unmatched by default 41 | private int hitState = UNMATCH; 42 | 43 | //the dictionary branch node reached so far during matching 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * start position of the matched segment 47 | */ 48 | private int begin; 49 | /* 50 | * end position of the matched segment 51 | */ 52 | private int end; 53 | 54 | 55 | /* 56 | * Checks whether this hit is a full match. 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /* 62 | * Marks this hit as a full match. 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /* 69 | * Checks whether this hit is a prefix of a dictionary word. 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /* 75 | * Marks this hit as a prefix match. 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /* 81 | * Checks whether this hit is unmatched. 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /* 87 | * Resets this hit to the unmatched state. 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /ik-analysis-core/src/test/java/org/wltea/analyzer/IKSegmenterTest.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | import org.wltea.analyzer.core.IKSegmenter; 6 | import org.wltea.analyzer.core.Lexeme; 7 | 8 | import 
java.io.Reader; 9 | import java.io.StringReader; 10 | 11 | public class IKSegmenterTest { 12 | 13 | @Test 14 | public void testSegment() throws Exception { 15 | Reader in = new StringReader("一一分 准确值就是它们听上去的那样。干柴诸如日期或用户ID。当然字符串也可以是准确值,如用户名或邮件地址。准确值Foo与准确值foo是不同的。准确值2014和准确值2014-09-15也是不同的。测试"); 16 | 17 | IKSegmenter segmenter = new IKSegmenter(in, MockDictionary.smartModeSqlite3Configure()); 18 | 19 | assertSegmenterCorrect(segmenter.next(), "一一分", 0, 3, 3, "CN_WORD"); 20 | assertSegmenterCorrect(segmenter.next(), "准确值", 4, 7, 3, "CN_WORD"); 21 | assertSegmenterCorrect(segmenter.next(), "听", 11, 12, 1, "CN_WORD"); 22 | assertSegmenterCorrect(segmenter.next(), "上去", 12, 14, 2, "CN_WORD"); 23 | assertSegmenterCorrect(segmenter.next(), "干柴", 18, 20, 2, "CN_WORD"); 24 | assertSegmenterCorrect(segmenter.next(), "诸如", 20, 22, 2, "CN_WORD"); 25 | assertSegmenterCorrect(segmenter.next(), "日期", 22, 24, 2, "CN_WORD"); 26 | assertSegmenterCorrect(segmenter.next(), "用户", 25, 27, 2, "CN_WORD"); 27 | assertSegmenterCorrect(segmenter.next(), "id", 27, 29, 2, "ENGLISH"); 28 | assertSegmenterCorrect(segmenter.next(), "当然", 30, 32, 2, "CN_WORD"); 29 | assertSegmenterCorrect(segmenter.next(), "字符串", 32, 35, 3, "CN_WORD"); 30 | assertSegmenterCorrect(segmenter.next(), "以是", 37, 39, 2, "CN_WORD"); 31 | assertSegmenterCorrect(segmenter.next(), "准确值", 39, 42, 3, "CN_WORD"); 32 | assertSegmenterCorrect(segmenter.next(), "用户名", 44, 47, 3, "CN_WORD"); 33 | assertSegmenterCorrect(segmenter.next(), "邮件地址", 48, 52, 4, "CN_WORD"); 34 | assertSegmenterCorrect(segmenter.next(), "准确值", 53, 56, 3, "CN_WORD"); 35 | assertSegmenterCorrect(segmenter.next(), "foo", 56, 59, 3, "ENGLISH"); 36 | assertSegmenterCorrect(segmenter.next(), "准确值", 60, 63, 3, "CN_WORD"); 37 | assertSegmenterCorrect(segmenter.next(), "foo", 63, 66, 3, "ENGLISH"); 38 | assertSegmenterCorrect(segmenter.next(), "不同", 67, 69, 2, "CN_WORD"); 39 | assertSegmenterCorrect(segmenter.next(), "准确值", 71, 74, 3, "CN_WORD"); 40 | assertSegmenterCorrect(segmenter.next(), "2014", 74, 78, 4, "ARABIC"); 41 | assertSegmenterCorrect(segmenter.next(), "准确值", 79, 82, 3, "CN_WORD"); 42 | assertSegmenterCorrect(segmenter.next(), "2014-09-15", 82, 92, 10, "LETTER"); 43 | assertSegmenterCorrect(segmenter.next(), "也是", 92, 94, 2, "CN_WORD"); 44 | assertSegmenterCorrect(segmenter.next(), "不同", 94, 96, 2, "CN_WORD"); 45 | assertSegmenterCorrect(segmenter.next(), "测试", 98, 100, 2, "CN_WORD"); 46 | } 47 | 48 | private void assertSegmenterCorrect(Lexeme nextLexeme, String lexemeText, int begin, int end, int length, String type) { 49 | Assert.assertEquals(lexemeText, nextLexeme.getLexemeText()); // JUnit's assertEquals takes (expected, actual) 50 | Assert.assertEquals(begin, nextLexeme.getBeginPosition()); 51 | Assert.assertEquals(end, nextLexeme.getEndPosition()); 52 | Assert.assertEquals(length, nextLexeme.getLength()); 53 | Assert.assertEquals(type, nextLexeme.getLexemeTypeString()); 54 | 55 | } 56 | 57 | private void print(Lexeme nextLexeme){ 58 | System.out.println(nextLexeme.getLexemeText()); 59 | System.out.println(nextLexeme.getBeginPosition()); 60 | System.out.println(nextLexeme.getEndPosition()); 61 | System.out.println(nextLexeme.getLength()); 62 | System.out.println(nextLexeme.getLexemeTypeString()); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /ik-analysis-core/src/test/java/org/wltea/analyzer/MockDictionary.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | 4 | import 
org.wltea.analyzer.configuration.DictionaryConfiguration; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | public class MockDictionary implements DictionaryConfiguration { 10 | 11 | private final List mainDictionary; 12 | private final List quantifierDictionary; 13 | private final List stopWordDictionary; 14 | 15 | 16 | private boolean smartMode = true; 17 | 18 | 19 | private MockDictionary() { 20 | 21 | mainDictionary = new ArrayList(); 22 | quantifierDictionary = new ArrayList(); 23 | stopWordDictionary = new ArrayList(); 24 | 25 | 26 | initStopWordDictionary(stopWordDictionary); 27 | 28 | mainDictionary.add("一一分".toCharArray()); 29 | mainDictionary.add("听".toCharArray()); 30 | mainDictionary.add("上去".toCharArray()); 31 | mainDictionary.add("那样".toCharArray()); 32 | mainDictionary.add("干柴".toCharArray()); 33 | mainDictionary.add("诸如".toCharArray()); 34 | mainDictionary.add("日期".toCharArray()); 35 | mainDictionary.add("用户".toCharArray()); 36 | mainDictionary.add("当然".toCharArray()); 37 | mainDictionary.add("字符串".toCharArray()); 38 | mainDictionary.add("以是".toCharArray()); 39 | mainDictionary.add("准确值".toCharArray()); 40 | mainDictionary.add("用户名".toCharArray()); 41 | mainDictionary.add("邮件地址".toCharArray()); 42 | mainDictionary.add("准确值".toCharArray()); 43 | mainDictionary.add("不同".toCharArray()); 44 | mainDictionary.add("也是".toCharArray()); 45 | mainDictionary.add("测试".toCharArray()); 46 | } 47 | 48 | public static MockDictionary smartModeSqlite3Configure() { 49 | MockDictionary sqlite3Configure = new MockDictionary(); 50 | sqlite3Configure.setSmartMode(true); 51 | return sqlite3Configure; 52 | } 53 | 54 | private void initStopWordDictionary(List stopWordDictionary) { 55 | stopWordDictionary.add("a".toCharArray()); 56 | stopWordDictionary.add("an".toCharArray()); 57 | stopWordDictionary.add("and".toCharArray()); 58 | stopWordDictionary.add("are".toCharArray()); 59 | stopWordDictionary.add("as".toCharArray()); 60 | stopWordDictionary.add("at".toCharArray()); 61 | stopWordDictionary.add("be".toCharArray()); 62 | stopWordDictionary.add("but".toCharArray()); 63 | stopWordDictionary.add("by".toCharArray()); 64 | stopWordDictionary.add("for".toCharArray()); 65 | stopWordDictionary.add("if".toCharArray()); 66 | stopWordDictionary.add("in".toCharArray()); 67 | stopWordDictionary.add("into".toCharArray()); 68 | stopWordDictionary.add("is".toCharArray()); 69 | stopWordDictionary.add("it".toCharArray()); 70 | stopWordDictionary.add("no".toCharArray()); 71 | stopWordDictionary.add("not".toCharArray()); 72 | stopWordDictionary.add("of".toCharArray()); 73 | stopWordDictionary.add("on".toCharArray()); 74 | stopWordDictionary.add("or".toCharArray()); 75 | stopWordDictionary.add("such".toCharArray()); 76 | stopWordDictionary.add("that".toCharArray()); 77 | stopWordDictionary.add("the".toCharArray()); 78 | stopWordDictionary.add("their".toCharArray()); 79 | stopWordDictionary.add("then".toCharArray()); 80 | stopWordDictionary.add("there".toCharArray()); 81 | stopWordDictionary.add("these".toCharArray()); 82 | stopWordDictionary.add("they".toCharArray()); 83 | stopWordDictionary.add("this".toCharArray()); 84 | stopWordDictionary.add("to".toCharArray()); 85 | stopWordDictionary.add("was".toCharArray()); 86 | stopWordDictionary.add("will".toCharArray()); 87 | stopWordDictionary.add("with".toCharArray()); 88 | stopWordDictionary.add("更好的".toCharArray()); 89 | stopWordDictionary.add("选择".toCharArray()); 90 | stopWordDictionary.add("啊".toCharArray()); 91 | 
stopWordDictionary.add("阿".toCharArray()); 92 | stopWordDictionary.add("哎".toCharArray()); 93 | stopWordDictionary.add("哎呀".toCharArray()); 94 | stopWordDictionary.add("哎哟".toCharArray()); 95 | stopWordDictionary.add("唉".toCharArray()); 96 | stopWordDictionary.add("俺".toCharArray()); 97 | stopWordDictionary.add("俺们".toCharArray()); 98 | stopWordDictionary.add("按".toCharArray()); 99 | stopWordDictionary.add("按照".toCharArray()); 100 | stopWordDictionary.add("吧".toCharArray()); 101 | stopWordDictionary.add("吧哒".toCharArray()); 102 | stopWordDictionary.add("把".toCharArray()); 103 | stopWordDictionary.add("罢了".toCharArray()); 104 | stopWordDictionary.add("被".toCharArray()); 105 | stopWordDictionary.add("本".toCharArray()); 106 | stopWordDictionary.add("本着".toCharArray()); 107 | stopWordDictionary.add("比".toCharArray()); 108 | stopWordDictionary.add("比方".toCharArray()); 109 | stopWordDictionary.add("比如".toCharArray()); 110 | stopWordDictionary.add("鄙人".toCharArray()); 111 | stopWordDictionary.add("彼".toCharArray()); 112 | stopWordDictionary.add("彼此".toCharArray()); 113 | stopWordDictionary.add("边".toCharArray()); 114 | stopWordDictionary.add("别".toCharArray()); 115 | stopWordDictionary.add("别的".toCharArray()); 116 | stopWordDictionary.add("别说".toCharArray()); 117 | stopWordDictionary.add("并".toCharArray()); 118 | stopWordDictionary.add("并且".toCharArray()); 119 | stopWordDictionary.add("不比".toCharArray()); 120 | stopWordDictionary.add("不成".toCharArray()); 121 | stopWordDictionary.add("不单".toCharArray()); 122 | stopWordDictionary.add("不但".toCharArray()); 123 | stopWordDictionary.add("不独".toCharArray()); 124 | stopWordDictionary.add("不管".toCharArray()); 125 | stopWordDictionary.add("不光".toCharArray()); 126 | stopWordDictionary.add("不过".toCharArray()); 127 | stopWordDictionary.add("不仅".toCharArray()); 128 | stopWordDictionary.add("不拘".toCharArray()); 129 | stopWordDictionary.add("不论".toCharArray()); 130 | stopWordDictionary.add("不怕".toCharArray()); 131 | stopWordDictionary.add("不然".toCharArray()); 132 | stopWordDictionary.add("不如".toCharArray()); 133 | stopWordDictionary.add("不特".toCharArray()); 134 | stopWordDictionary.add("不惟".toCharArray()); 135 | stopWordDictionary.add("不问".toCharArray()); 136 | stopWordDictionary.add("不只".toCharArray()); 137 | stopWordDictionary.add("朝".toCharArray()); 138 | stopWordDictionary.add("朝着".toCharArray()); 139 | stopWordDictionary.add("趁".toCharArray()); 140 | stopWordDictionary.add("趁着".toCharArray()); 141 | stopWordDictionary.add("乘".toCharArray()); 142 | stopWordDictionary.add("冲".toCharArray()); 143 | stopWordDictionary.add("除".toCharArray()); 144 | stopWordDictionary.add("除此之外".toCharArray()); 145 | stopWordDictionary.add("除非".toCharArray()); 146 | stopWordDictionary.add("除了".toCharArray()); 147 | stopWordDictionary.add("此".toCharArray()); 148 | stopWordDictionary.add("此间".toCharArray()); 149 | stopWordDictionary.add("此外".toCharArray()); 150 | stopWordDictionary.add("从".toCharArray()); 151 | stopWordDictionary.add("从而".toCharArray()); 152 | stopWordDictionary.add("打".toCharArray()); 153 | stopWordDictionary.add("待".toCharArray()); 154 | stopWordDictionary.add("但".toCharArray()); 155 | stopWordDictionary.add("但是".toCharArray()); 156 | stopWordDictionary.add("当".toCharArray()); 157 | stopWordDictionary.add("当着".toCharArray()); 158 | stopWordDictionary.add("到".toCharArray()); 159 | stopWordDictionary.add("得".toCharArray()); 160 | stopWordDictionary.add("的".toCharArray()); 161 | stopWordDictionary.add("的话".toCharArray()); 162 | 
stopWordDictionary.add("等".toCharArray()); 163 | stopWordDictionary.add("等等".toCharArray()); 164 | stopWordDictionary.add("地".toCharArray()); 165 | stopWordDictionary.add("第".toCharArray()); 166 | stopWordDictionary.add("叮咚".toCharArray()); 167 | stopWordDictionary.add("对".toCharArray()); 168 | stopWordDictionary.add("对于".toCharArray()); 169 | stopWordDictionary.add("多".toCharArray()); 170 | stopWordDictionary.add("多少".toCharArray()); 171 | stopWordDictionary.add("而".toCharArray()); 172 | stopWordDictionary.add("而况".toCharArray()); 173 | stopWordDictionary.add("而且".toCharArray()); 174 | stopWordDictionary.add("而是".toCharArray()); 175 | stopWordDictionary.add("而外".toCharArray()); 176 | stopWordDictionary.add("而言".toCharArray()); 177 | stopWordDictionary.add("而已".toCharArray()); 178 | stopWordDictionary.add("尔后".toCharArray()); 179 | stopWordDictionary.add("反过来".toCharArray()); 180 | stopWordDictionary.add("反过来说".toCharArray()); 181 | stopWordDictionary.add("反之".toCharArray()); 182 | stopWordDictionary.add("非但".toCharArray()); 183 | stopWordDictionary.add("非徒".toCharArray()); 184 | stopWordDictionary.add("否则".toCharArray()); 185 | stopWordDictionary.add("嘎".toCharArray()); 186 | stopWordDictionary.add("嘎登".toCharArray()); 187 | stopWordDictionary.add("该".toCharArray()); 188 | stopWordDictionary.add("赶".toCharArray()); 189 | stopWordDictionary.add("个".toCharArray()); 190 | stopWordDictionary.add("各".toCharArray()); 191 | stopWordDictionary.add("各个".toCharArray()); 192 | stopWordDictionary.add("各位".toCharArray()); 193 | stopWordDictionary.add("各种".toCharArray()); 194 | stopWordDictionary.add("各自".toCharArray()); 195 | stopWordDictionary.add("给".toCharArray()); 196 | stopWordDictionary.add("根据".toCharArray()); 197 | stopWordDictionary.add("跟".toCharArray()); 198 | stopWordDictionary.add("故".toCharArray()); 199 | stopWordDictionary.add("故此".toCharArray()); 200 | stopWordDictionary.add("固然".toCharArray()); 201 | stopWordDictionary.add("关于".toCharArray()); 202 | stopWordDictionary.add("管".toCharArray()); 203 | stopWordDictionary.add("归".toCharArray()); 204 | stopWordDictionary.add("果然".toCharArray()); 205 | stopWordDictionary.add("果真".toCharArray()); 206 | stopWordDictionary.add("过".toCharArray()); 207 | stopWordDictionary.add("哈".toCharArray()); 208 | stopWordDictionary.add("哈哈".toCharArray()); 209 | stopWordDictionary.add("呵".toCharArray()); 210 | stopWordDictionary.add("和".toCharArray()); 211 | stopWordDictionary.add("何".toCharArray()); 212 | stopWordDictionary.add("何处".toCharArray()); 213 | stopWordDictionary.add("何况".toCharArray()); 214 | stopWordDictionary.add("何时".toCharArray()); 215 | stopWordDictionary.add("嘿".toCharArray()); 216 | stopWordDictionary.add("哼".toCharArray()); 217 | stopWordDictionary.add("哼唷".toCharArray()); 218 | stopWordDictionary.add("呼哧".toCharArray()); 219 | stopWordDictionary.add("乎".toCharArray()); 220 | stopWordDictionary.add("哗".toCharArray()); 221 | stopWordDictionary.add("还是".toCharArray()); 222 | stopWordDictionary.add("还有".toCharArray()); 223 | stopWordDictionary.add("换句话说".toCharArray()); 224 | stopWordDictionary.add("换言之".toCharArray()); 225 | stopWordDictionary.add("或".toCharArray()); 226 | stopWordDictionary.add("或是".toCharArray()); 227 | stopWordDictionary.add("或者".toCharArray()); 228 | stopWordDictionary.add("极了".toCharArray()); 229 | stopWordDictionary.add("及".toCharArray()); 230 | stopWordDictionary.add("及其".toCharArray()); 231 | stopWordDictionary.add("及至".toCharArray()); 232 | stopWordDictionary.add("即".toCharArray()); 233 | 
stopWordDictionary.add("即便".toCharArray()); 234 | stopWordDictionary.add("即或".toCharArray()); 235 | stopWordDictionary.add("即令".toCharArray()); 236 | stopWordDictionary.add("即若".toCharArray()); 237 | stopWordDictionary.add("即使".toCharArray()); 238 | stopWordDictionary.add("几".toCharArray()); 239 | stopWordDictionary.add("几时".toCharArray()); 240 | stopWordDictionary.add("己".toCharArray()); 241 | stopWordDictionary.add("既".toCharArray()); 242 | stopWordDictionary.add("既然".toCharArray()); 243 | stopWordDictionary.add("既是".toCharArray()); 244 | stopWordDictionary.add("继而".toCharArray()); 245 | stopWordDictionary.add("加之".toCharArray()); 246 | stopWordDictionary.add("假如".toCharArray()); 247 | stopWordDictionary.add("假若".toCharArray()); 248 | stopWordDictionary.add("假使".toCharArray()); 249 | stopWordDictionary.add("鉴于".toCharArray()); 250 | stopWordDictionary.add("将".toCharArray()); 251 | stopWordDictionary.add("较".toCharArray()); 252 | stopWordDictionary.add("较之".toCharArray()); 253 | stopWordDictionary.add("叫".toCharArray()); 254 | stopWordDictionary.add("接着".toCharArray()); 255 | stopWordDictionary.add("结果".toCharArray()); 256 | stopWordDictionary.add("借".toCharArray()); 257 | stopWordDictionary.add("紧接着".toCharArray()); 258 | stopWordDictionary.add("进而".toCharArray()); 259 | stopWordDictionary.add("尽".toCharArray()); 260 | stopWordDictionary.add("尽管".toCharArray()); 261 | stopWordDictionary.add("经".toCharArray()); 262 | stopWordDictionary.add("经过".toCharArray()); 263 | stopWordDictionary.add("就".toCharArray()); 264 | stopWordDictionary.add("就是".toCharArray()); 265 | stopWordDictionary.add("就是说".toCharArray()); 266 | stopWordDictionary.add("据".toCharArray()); 267 | stopWordDictionary.add("具体地说".toCharArray()); 268 | stopWordDictionary.add("具体说来".toCharArray()); 269 | stopWordDictionary.add("开始".toCharArray()); 270 | stopWordDictionary.add("开外".toCharArray()); 271 | stopWordDictionary.add("靠".toCharArray()); 272 | stopWordDictionary.add("咳".toCharArray()); 273 | stopWordDictionary.add("可".toCharArray()); 274 | stopWordDictionary.add("可见".toCharArray()); 275 | stopWordDictionary.add("可是".toCharArray()); 276 | stopWordDictionary.add("可以".toCharArray()); 277 | stopWordDictionary.add("况且".toCharArray()); 278 | stopWordDictionary.add("啦".toCharArray()); 279 | stopWordDictionary.add("来".toCharArray()); 280 | stopWordDictionary.add("来着".toCharArray()); 281 | stopWordDictionary.add("离".toCharArray()); 282 | stopWordDictionary.add("例如".toCharArray()); 283 | stopWordDictionary.add("哩".toCharArray()); 284 | stopWordDictionary.add("连".toCharArray()); 285 | stopWordDictionary.add("连同".toCharArray()); 286 | stopWordDictionary.add("两者".toCharArray()); 287 | stopWordDictionary.add("了".toCharArray()); 288 | stopWordDictionary.add("临".toCharArray()); 289 | stopWordDictionary.add("另".toCharArray()); 290 | stopWordDictionary.add("另外".toCharArray()); 291 | stopWordDictionary.add("另一方面".toCharArray()); 292 | stopWordDictionary.add("论".toCharArray()); 293 | stopWordDictionary.add("嘛".toCharArray()); 294 | stopWordDictionary.add("吗".toCharArray()); 295 | stopWordDictionary.add("慢说".toCharArray()); 296 | stopWordDictionary.add("漫说".toCharArray()); 297 | stopWordDictionary.add("冒".toCharArray()); 298 | stopWordDictionary.add("么".toCharArray()); 299 | stopWordDictionary.add("每".toCharArray()); 300 | stopWordDictionary.add("每当".toCharArray()); 301 | stopWordDictionary.add("们".toCharArray()); 302 | stopWordDictionary.add("莫若".toCharArray()); 303 | stopWordDictionary.add("某".toCharArray()); 304 | 
stopWordDictionary.add("某个".toCharArray()); 305 | stopWordDictionary.add("某些".toCharArray()); 306 | stopWordDictionary.add("拿".toCharArray()); 307 | stopWordDictionary.add("哪".toCharArray()); 308 | stopWordDictionary.add("哪边".toCharArray()); 309 | stopWordDictionary.add("哪儿".toCharArray()); 310 | stopWordDictionary.add("哪个".toCharArray()); 311 | stopWordDictionary.add("哪里".toCharArray()); 312 | stopWordDictionary.add("哪年".toCharArray()); 313 | stopWordDictionary.add("哪怕".toCharArray()); 314 | stopWordDictionary.add("哪天".toCharArray()); 315 | stopWordDictionary.add("哪些".toCharArray()); 316 | stopWordDictionary.add("哪样".toCharArray()); 317 | stopWordDictionary.add("那".toCharArray()); 318 | stopWordDictionary.add("那边".toCharArray()); 319 | stopWordDictionary.add("那儿".toCharArray()); 320 | stopWordDictionary.add("那个".toCharArray()); 321 | stopWordDictionary.add("那会儿".toCharArray()); 322 | stopWordDictionary.add("那里".toCharArray()); 323 | stopWordDictionary.add("那么".toCharArray()); 324 | stopWordDictionary.add("那么些".toCharArray()); 325 | stopWordDictionary.add("那么样".toCharArray()); 326 | stopWordDictionary.add("那时".toCharArray()); 327 | stopWordDictionary.add("那些".toCharArray()); 328 | stopWordDictionary.add("那样".toCharArray()); 329 | stopWordDictionary.add("乃".toCharArray()); 330 | stopWordDictionary.add("乃至".toCharArray()); 331 | stopWordDictionary.add("呢".toCharArray()); 332 | stopWordDictionary.add("能".toCharArray()); 333 | stopWordDictionary.add("你".toCharArray()); 334 | stopWordDictionary.add("你们".toCharArray()); 335 | stopWordDictionary.add("您".toCharArray()); 336 | stopWordDictionary.add("宁".toCharArray()); 337 | stopWordDictionary.add("宁可".toCharArray()); 338 | stopWordDictionary.add("宁肯".toCharArray()); 339 | stopWordDictionary.add("宁愿".toCharArray()); 340 | stopWordDictionary.add("哦".toCharArray()); 341 | stopWordDictionary.add("呕".toCharArray()); 342 | stopWordDictionary.add("啪达".toCharArray()); 343 | stopWordDictionary.add("旁人".toCharArray()); 344 | stopWordDictionary.add("呸".toCharArray()); 345 | stopWordDictionary.add("凭".toCharArray()); 346 | stopWordDictionary.add("凭借".toCharArray()); 347 | stopWordDictionary.add("其".toCharArray()); 348 | stopWordDictionary.add("其次".toCharArray()); 349 | stopWordDictionary.add("其二".toCharArray()); 350 | stopWordDictionary.add("其他".toCharArray()); 351 | stopWordDictionary.add("其它".toCharArray()); 352 | stopWordDictionary.add("其一".toCharArray()); 353 | stopWordDictionary.add("其余".toCharArray()); 354 | stopWordDictionary.add("其中".toCharArray()); 355 | stopWordDictionary.add("起".toCharArray()); 356 | stopWordDictionary.add("起见".toCharArray()); 357 | stopWordDictionary.add("起见".toCharArray()); 358 | stopWordDictionary.add("岂但".toCharArray()); 359 | stopWordDictionary.add("恰恰相反".toCharArray()); 360 | stopWordDictionary.add("前后".toCharArray()); 361 | stopWordDictionary.add("前者".toCharArray()); 362 | stopWordDictionary.add("且".toCharArray()); 363 | stopWordDictionary.add("然而".toCharArray()); 364 | stopWordDictionary.add("然后".toCharArray()); 365 | stopWordDictionary.add("然则".toCharArray()); 366 | stopWordDictionary.add("让".toCharArray()); 367 | stopWordDictionary.add("人家".toCharArray()); 368 | stopWordDictionary.add("任".toCharArray()); 369 | stopWordDictionary.add("任何".toCharArray()); 370 | stopWordDictionary.add("任凭".toCharArray()); 371 | stopWordDictionary.add("如".toCharArray()); 372 | stopWordDictionary.add("如此".toCharArray()); 373 | stopWordDictionary.add("如果".toCharArray()); 374 | stopWordDictionary.add("如何".toCharArray()); 375 | 
stopWordDictionary.add("如其".toCharArray()); 376 | stopWordDictionary.add("如若".toCharArray()); 377 | stopWordDictionary.add("如上所述".toCharArray()); 378 | stopWordDictionary.add("若".toCharArray()); 379 | stopWordDictionary.add("若非".toCharArray()); 380 | stopWordDictionary.add("若是".toCharArray()); 381 | stopWordDictionary.add("啥".toCharArray()); 382 | stopWordDictionary.add("上下".toCharArray()); 383 | stopWordDictionary.add("尚且".toCharArray()); 384 | stopWordDictionary.add("设若".toCharArray()); 385 | stopWordDictionary.add("设使".toCharArray()); 386 | stopWordDictionary.add("甚而".toCharArray()); 387 | stopWordDictionary.add("甚么".toCharArray()); 388 | stopWordDictionary.add("甚至".toCharArray()); 389 | stopWordDictionary.add("省得".toCharArray()); 390 | stopWordDictionary.add("时候".toCharArray()); 391 | stopWordDictionary.add("什么".toCharArray()); 392 | stopWordDictionary.add("什么样".toCharArray()); 393 | stopWordDictionary.add("使得".toCharArray()); 394 | stopWordDictionary.add("是".toCharArray()); 395 | stopWordDictionary.add("是的".toCharArray()); 396 | stopWordDictionary.add("首先".toCharArray()); 397 | stopWordDictionary.add("谁".toCharArray()); 398 | stopWordDictionary.add("谁知".toCharArray()); 399 | stopWordDictionary.add("顺".toCharArray()); 400 | stopWordDictionary.add("顺着".toCharArray()); 401 | stopWordDictionary.add("似的".toCharArray()); 402 | stopWordDictionary.add("虽".toCharArray()); 403 | stopWordDictionary.add("虽然".toCharArray()); 404 | stopWordDictionary.add("虽说".toCharArray()); 405 | stopWordDictionary.add("虽则".toCharArray()); 406 | stopWordDictionary.add("随".toCharArray()); 407 | stopWordDictionary.add("随着".toCharArray()); 408 | stopWordDictionary.add("所".toCharArray()); 409 | stopWordDictionary.add("所以".toCharArray()); 410 | stopWordDictionary.add("他".toCharArray()); 411 | stopWordDictionary.add("他们".toCharArray()); 412 | stopWordDictionary.add("他人".toCharArray()); 413 | stopWordDictionary.add("它".toCharArray()); 414 | stopWordDictionary.add("它们".toCharArray()); 415 | stopWordDictionary.add("她".toCharArray()); 416 | stopWordDictionary.add("她们".toCharArray()); 417 | stopWordDictionary.add("倘".toCharArray()); 418 | stopWordDictionary.add("倘或".toCharArray()); 419 | stopWordDictionary.add("倘然".toCharArray()); 420 | stopWordDictionary.add("倘若".toCharArray()); 421 | stopWordDictionary.add("倘使".toCharArray()); 422 | stopWordDictionary.add("腾".toCharArray()); 423 | stopWordDictionary.add("替".toCharArray()); 424 | stopWordDictionary.add("通过".toCharArray()); 425 | stopWordDictionary.add("同".toCharArray()); 426 | stopWordDictionary.add("同时".toCharArray()); 427 | stopWordDictionary.add("哇".toCharArray()); 428 | stopWordDictionary.add("万一".toCharArray()); 429 | stopWordDictionary.add("往".toCharArray()); 430 | stopWordDictionary.add("望".toCharArray()); 431 | stopWordDictionary.add("为".toCharArray()); 432 | stopWordDictionary.add("为何".toCharArray()); 433 | stopWordDictionary.add("为了".toCharArray()); 434 | stopWordDictionary.add("为什么".toCharArray()); 435 | stopWordDictionary.add("为着".toCharArray()); 436 | stopWordDictionary.add("喂".toCharArray()); 437 | stopWordDictionary.add("嗡嗡".toCharArray()); 438 | stopWordDictionary.add("我".toCharArray()); 439 | stopWordDictionary.add("我们".toCharArray()); 440 | stopWordDictionary.add("呜".toCharArray()); 441 | stopWordDictionary.add("呜呼".toCharArray()); 442 | stopWordDictionary.add("乌乎".toCharArray()); 443 | stopWordDictionary.add("无论".toCharArray()); 444 | stopWordDictionary.add("无宁".toCharArray()); 445 | stopWordDictionary.add("毋宁".toCharArray()); 446 | 
stopWordDictionary.add("嘻".toCharArray()); 447 | stopWordDictionary.add("吓".toCharArray()); 448 | stopWordDictionary.add("相对而言".toCharArray()); 449 | stopWordDictionary.add("像".toCharArray()); 450 | stopWordDictionary.add("向".toCharArray()); 451 | stopWordDictionary.add("向着".toCharArray()); 452 | stopWordDictionary.add("嘘".toCharArray()); 453 | stopWordDictionary.add("呀".toCharArray()); 454 | stopWordDictionary.add("焉".toCharArray()); 455 | stopWordDictionary.add("沿".toCharArray()); 456 | stopWordDictionary.add("沿着".toCharArray()); 457 | stopWordDictionary.add("要".toCharArray()); 458 | stopWordDictionary.add("要不".toCharArray()); 459 | stopWordDictionary.add("要不然".toCharArray()); 460 | stopWordDictionary.add("要不是".toCharArray()); 461 | stopWordDictionary.add("要么".toCharArray()); 462 | stopWordDictionary.add("要是".toCharArray()); 463 | stopWordDictionary.add("也".toCharArray()); 464 | stopWordDictionary.add("也罢".toCharArray()); 465 | stopWordDictionary.add("也好".toCharArray()); 466 | stopWordDictionary.add("一".toCharArray()); 467 | stopWordDictionary.add("一般".toCharArray()); 468 | stopWordDictionary.add("一旦".toCharArray()); 469 | stopWordDictionary.add("一方面".toCharArray()); 470 | stopWordDictionary.add("一来".toCharArray()); 471 | stopWordDictionary.add("一切".toCharArray()); 472 | stopWordDictionary.add("一样".toCharArray()); 473 | stopWordDictionary.add("一则".toCharArray()); 474 | stopWordDictionary.add("依".toCharArray()); 475 | stopWordDictionary.add("依照".toCharArray()); 476 | stopWordDictionary.add("矣".toCharArray()); 477 | stopWordDictionary.add("以".toCharArray()); 478 | stopWordDictionary.add("以便".toCharArray()); 479 | stopWordDictionary.add("以及".toCharArray()); 480 | stopWordDictionary.add("以免".toCharArray()); 481 | stopWordDictionary.add("以至".toCharArray()); 482 | stopWordDictionary.add("以至于".toCharArray()); 483 | stopWordDictionary.add("以致".toCharArray()); 484 | stopWordDictionary.add("抑或".toCharArray()); 485 | stopWordDictionary.add("因".toCharArray()); 486 | stopWordDictionary.add("因此".toCharArray()); 487 | stopWordDictionary.add("因而".toCharArray()); 488 | stopWordDictionary.add("因为".toCharArray()); 489 | stopWordDictionary.add("哟".toCharArray()); 490 | stopWordDictionary.add("用".toCharArray()); 491 | stopWordDictionary.add("由".toCharArray()); 492 | stopWordDictionary.add("由此可见".toCharArray()); 493 | stopWordDictionary.add("由于".toCharArray()); 494 | stopWordDictionary.add("有".toCharArray()); 495 | stopWordDictionary.add("有的".toCharArray()); 496 | stopWordDictionary.add("有关".toCharArray()); 497 | stopWordDictionary.add("有些".toCharArray()); 498 | stopWordDictionary.add("又".toCharArray()); 499 | stopWordDictionary.add("于".toCharArray()); 500 | stopWordDictionary.add("于是".toCharArray()); 501 | stopWordDictionary.add("于是乎".toCharArray()); 502 | stopWordDictionary.add("与".toCharArray()); 503 | stopWordDictionary.add("与此同时".toCharArray()); 504 | stopWordDictionary.add("与否".toCharArray()); 505 | stopWordDictionary.add("与其".toCharArray()); 506 | stopWordDictionary.add("越是".toCharArray()); 507 | stopWordDictionary.add("云云".toCharArray()); 508 | stopWordDictionary.add("哉".toCharArray()); 509 | stopWordDictionary.add("再说".toCharArray()); 510 | stopWordDictionary.add("再者".toCharArray()); 511 | stopWordDictionary.add("在".toCharArray()); 512 | stopWordDictionary.add("在下".toCharArray()); 513 | stopWordDictionary.add("咱".toCharArray()); 514 | stopWordDictionary.add("咱们".toCharArray()); 515 | stopWordDictionary.add("则".toCharArray()); 516 | stopWordDictionary.add("怎".toCharArray()); 517 | 
stopWordDictionary.add("怎么".toCharArray()); 518 | stopWordDictionary.add("怎么办".toCharArray()); 519 | stopWordDictionary.add("怎么样".toCharArray()); 520 | stopWordDictionary.add("怎样".toCharArray()); 521 | stopWordDictionary.add("咋".toCharArray()); 522 | stopWordDictionary.add("照".toCharArray()); 523 | stopWordDictionary.add("照着".toCharArray()); 524 | stopWordDictionary.add("者".toCharArray()); 525 | stopWordDictionary.add("这".toCharArray()); 526 | stopWordDictionary.add("这边".toCharArray()); 527 | stopWordDictionary.add("这儿".toCharArray()); 528 | stopWordDictionary.add("这个".toCharArray()); 529 | stopWordDictionary.add("这会儿".toCharArray()); 530 | stopWordDictionary.add("这就是说".toCharArray()); 531 | stopWordDictionary.add("这里".toCharArray()); 532 | stopWordDictionary.add("这么".toCharArray()); 533 | stopWordDictionary.add("这么点儿".toCharArray()); 534 | stopWordDictionary.add("这么些".toCharArray()); 535 | stopWordDictionary.add("这么样".toCharArray()); 536 | stopWordDictionary.add("这时".toCharArray()); 537 | stopWordDictionary.add("这些".toCharArray()); 538 | stopWordDictionary.add("这样".toCharArray()); 539 | stopWordDictionary.add("正如".toCharArray()); 540 | stopWordDictionary.add("a".toCharArray()); 541 | stopWordDictionary.add("an".toCharArray()); 542 | stopWordDictionary.add("and".toCharArray()); 543 | stopWordDictionary.add("are".toCharArray()); 544 | stopWordDictionary.add("as".toCharArray()); 545 | stopWordDictionary.add("at".toCharArray()); 546 | stopWordDictionary.add("be".toCharArray()); 547 | stopWordDictionary.add("but".toCharArray()); 548 | stopWordDictionary.add("by".toCharArray()); 549 | stopWordDictionary.add("for".toCharArray()); 550 | stopWordDictionary.add("if".toCharArray()); 551 | stopWordDictionary.add("in".toCharArray()); 552 | stopWordDictionary.add("into".toCharArray()); 553 | stopWordDictionary.add("is".toCharArray()); 554 | stopWordDictionary.add("it".toCharArray()); 555 | stopWordDictionary.add("no".toCharArray()); 556 | stopWordDictionary.add("not".toCharArray()); 557 | stopWordDictionary.add("of".toCharArray()); 558 | stopWordDictionary.add("on".toCharArray()); 559 | stopWordDictionary.add("or".toCharArray()); 560 | stopWordDictionary.add("such".toCharArray()); 561 | stopWordDictionary.add("that".toCharArray()); 562 | stopWordDictionary.add("the".toCharArray()); 563 | stopWordDictionary.add("their".toCharArray()); 564 | stopWordDictionary.add("then".toCharArray()); 565 | stopWordDictionary.add("there".toCharArray()); 566 | stopWordDictionary.add("these".toCharArray()); 567 | stopWordDictionary.add("they".toCharArray()); 568 | stopWordDictionary.add("this".toCharArray()); 569 | stopWordDictionary.add("to".toCharArray()); 570 | stopWordDictionary.add("was".toCharArray()); 571 | stopWordDictionary.add("will".toCharArray()); 572 | stopWordDictionary.add("with".toCharArray()); 573 | stopWordDictionary.add("更好的".toCharArray()); 574 | stopWordDictionary.add("选择".toCharArray()); 575 | stopWordDictionary.add("啊".toCharArray()); 576 | stopWordDictionary.add("阿".toCharArray()); 577 | stopWordDictionary.add("哎".toCharArray()); 578 | stopWordDictionary.add("哎呀".toCharArray()); 579 | stopWordDictionary.add("哎哟".toCharArray()); 580 | stopWordDictionary.add("唉".toCharArray()); 581 | stopWordDictionary.add("俺".toCharArray()); 582 | stopWordDictionary.add("俺们".toCharArray()); 583 | stopWordDictionary.add("按".toCharArray()); 584 | stopWordDictionary.add("按照".toCharArray()); 585 | stopWordDictionary.add("吧".toCharArray()); 586 | stopWordDictionary.add("吧哒".toCharArray()); 587 | 
stopWordDictionary.add("把".toCharArray()); 588 | stopWordDictionary.add("罢了".toCharArray()); 589 | stopWordDictionary.add("被".toCharArray()); 590 | stopWordDictionary.add("本".toCharArray()); 591 | stopWordDictionary.add("本着".toCharArray()); 592 | stopWordDictionary.add("比".toCharArray()); 593 | stopWordDictionary.add("比方".toCharArray()); 594 | stopWordDictionary.add("比如".toCharArray()); 595 | stopWordDictionary.add("鄙人".toCharArray()); 596 | stopWordDictionary.add("彼".toCharArray()); 597 | stopWordDictionary.add("彼此".toCharArray()); 598 | stopWordDictionary.add("边".toCharArray()); 599 | stopWordDictionary.add("别".toCharArray()); 600 | stopWordDictionary.add("别的".toCharArray()); 601 | stopWordDictionary.add("别说".toCharArray()); 602 | stopWordDictionary.add("并".toCharArray()); 603 | stopWordDictionary.add("并且".toCharArray()); 604 | stopWordDictionary.add("不比".toCharArray()); 605 | stopWordDictionary.add("不成".toCharArray()); 606 | stopWordDictionary.add("不单".toCharArray()); 607 | stopWordDictionary.add("不但".toCharArray()); 608 | stopWordDictionary.add("不独".toCharArray()); 609 | stopWordDictionary.add("不管".toCharArray()); 610 | stopWordDictionary.add("不光".toCharArray()); 611 | stopWordDictionary.add("不过".toCharArray()); 612 | stopWordDictionary.add("不仅".toCharArray()); 613 | stopWordDictionary.add("不拘".toCharArray()); 614 | stopWordDictionary.add("不论".toCharArray()); 615 | stopWordDictionary.add("不怕".toCharArray()); 616 | stopWordDictionary.add("不然".toCharArray()); 617 | stopWordDictionary.add("不如".toCharArray()); 618 | stopWordDictionary.add("不特".toCharArray()); 619 | stopWordDictionary.add("不惟".toCharArray()); 620 | stopWordDictionary.add("不问".toCharArray()); 621 | stopWordDictionary.add("不只".toCharArray()); 622 | stopWordDictionary.add("朝".toCharArray()); 623 | stopWordDictionary.add("朝着".toCharArray()); 624 | stopWordDictionary.add("趁".toCharArray()); 625 | stopWordDictionary.add("趁着".toCharArray()); 626 | stopWordDictionary.add("乘".toCharArray()); 627 | stopWordDictionary.add("冲".toCharArray()); 628 | stopWordDictionary.add("除".toCharArray()); 629 | stopWordDictionary.add("除此之外".toCharArray()); 630 | stopWordDictionary.add("除非".toCharArray()); 631 | stopWordDictionary.add("除了".toCharArray()); 632 | stopWordDictionary.add("此".toCharArray()); 633 | stopWordDictionary.add("此间".toCharArray()); 634 | stopWordDictionary.add("此外".toCharArray()); 635 | stopWordDictionary.add("从".toCharArray()); 636 | stopWordDictionary.add("从而".toCharArray()); 637 | stopWordDictionary.add("打".toCharArray()); 638 | stopWordDictionary.add("待".toCharArray()); 639 | stopWordDictionary.add("但".toCharArray()); 640 | stopWordDictionary.add("但是".toCharArray()); 641 | stopWordDictionary.add("当".toCharArray()); 642 | stopWordDictionary.add("当着".toCharArray()); 643 | stopWordDictionary.add("到".toCharArray()); 644 | stopWordDictionary.add("得".toCharArray()); 645 | stopWordDictionary.add("的".toCharArray()); 646 | stopWordDictionary.add("的话".toCharArray()); 647 | stopWordDictionary.add("等".toCharArray()); 648 | stopWordDictionary.add("等等".toCharArray()); 649 | stopWordDictionary.add("地".toCharArray()); 650 | stopWordDictionary.add("第".toCharArray()); 651 | stopWordDictionary.add("叮咚".toCharArray()); 652 | stopWordDictionary.add("对".toCharArray()); 653 | stopWordDictionary.add("对于".toCharArray()); 654 | stopWordDictionary.add("多".toCharArray()); 655 | stopWordDictionary.add("多少".toCharArray()); 656 | stopWordDictionary.add("而".toCharArray()); 657 | stopWordDictionary.add("而况".toCharArray()); 658 | 
stopWordDictionary.add("而且".toCharArray()); 659 | stopWordDictionary.add("而是".toCharArray()); 660 | stopWordDictionary.add("而外".toCharArray()); 661 | stopWordDictionary.add("而言".toCharArray()); 662 | stopWordDictionary.add("而已".toCharArray()); 663 | stopWordDictionary.add("尔后".toCharArray()); 664 | stopWordDictionary.add("反过来".toCharArray()); 665 | stopWordDictionary.add("反过来说".toCharArray()); 666 | stopWordDictionary.add("反之".toCharArray()); 667 | stopWordDictionary.add("非但".toCharArray()); 668 | stopWordDictionary.add("非徒".toCharArray()); 669 | stopWordDictionary.add("否则".toCharArray()); 670 | stopWordDictionary.add("嘎".toCharArray()); 671 | stopWordDictionary.add("嘎登".toCharArray()); 672 | stopWordDictionary.add("该".toCharArray()); 673 | stopWordDictionary.add("赶".toCharArray()); 674 | stopWordDictionary.add("个".toCharArray()); 675 | stopWordDictionary.add("各".toCharArray()); 676 | stopWordDictionary.add("各个".toCharArray()); 677 | stopWordDictionary.add("各位".toCharArray()); 678 | stopWordDictionary.add("各种".toCharArray()); 679 | stopWordDictionary.add("各自".toCharArray()); 680 | stopWordDictionary.add("给".toCharArray()); 681 | stopWordDictionary.add("根据".toCharArray()); 682 | stopWordDictionary.add("跟".toCharArray()); 683 | stopWordDictionary.add("故".toCharArray()); 684 | stopWordDictionary.add("故此".toCharArray()); 685 | stopWordDictionary.add("固然".toCharArray()); 686 | stopWordDictionary.add("关于".toCharArray()); 687 | stopWordDictionary.add("管".toCharArray()); 688 | stopWordDictionary.add("归".toCharArray()); 689 | stopWordDictionary.add("果然".toCharArray()); 690 | stopWordDictionary.add("果真".toCharArray()); 691 | stopWordDictionary.add("过".toCharArray()); 692 | stopWordDictionary.add("哈".toCharArray()); 693 | stopWordDictionary.add("哈哈".toCharArray()); 694 | stopWordDictionary.add("呵".toCharArray()); 695 | stopWordDictionary.add("和".toCharArray()); 696 | stopWordDictionary.add("何".toCharArray()); 697 | stopWordDictionary.add("何处".toCharArray()); 698 | stopWordDictionary.add("何况".toCharArray()); 699 | stopWordDictionary.add("何时".toCharArray()); 700 | stopWordDictionary.add("嘿".toCharArray()); 701 | stopWordDictionary.add("哼".toCharArray()); 702 | stopWordDictionary.add("哼唷".toCharArray()); 703 | stopWordDictionary.add("呼哧".toCharArray()); 704 | stopWordDictionary.add("乎".toCharArray()); 705 | stopWordDictionary.add("哗".toCharArray()); 706 | stopWordDictionary.add("还是".toCharArray()); 707 | stopWordDictionary.add("还有".toCharArray()); 708 | stopWordDictionary.add("换句话说".toCharArray()); 709 | stopWordDictionary.add("换言之".toCharArray()); 710 | stopWordDictionary.add("或".toCharArray()); 711 | stopWordDictionary.add("或是".toCharArray()); 712 | stopWordDictionary.add("或者".toCharArray()); 713 | stopWordDictionary.add("极了".toCharArray()); 714 | stopWordDictionary.add("及".toCharArray()); 715 | stopWordDictionary.add("及其".toCharArray()); 716 | stopWordDictionary.add("及至".toCharArray()); 717 | stopWordDictionary.add("即".toCharArray()); 718 | stopWordDictionary.add("即便".toCharArray()); 719 | stopWordDictionary.add("即或".toCharArray()); 720 | stopWordDictionary.add("即令".toCharArray()); 721 | stopWordDictionary.add("即若".toCharArray()); 722 | stopWordDictionary.add("即使".toCharArray()); 723 | stopWordDictionary.add("几".toCharArray()); 724 | stopWordDictionary.add("几时".toCharArray()); 725 | stopWordDictionary.add("己".toCharArray()); 726 | stopWordDictionary.add("既".toCharArray()); 727 | stopWordDictionary.add("既然".toCharArray()); 728 | stopWordDictionary.add("既是".toCharArray()); 729 | 
stopWordDictionary.add("继而".toCharArray()); 730 | stopWordDictionary.add("加之".toCharArray()); 731 | stopWordDictionary.add("假如".toCharArray()); 732 | stopWordDictionary.add("假若".toCharArray()); 733 | stopWordDictionary.add("假使".toCharArray()); 734 | stopWordDictionary.add("鉴于".toCharArray()); 735 | stopWordDictionary.add("将".toCharArray()); 736 | stopWordDictionary.add("较".toCharArray()); 737 | stopWordDictionary.add("较之".toCharArray()); 738 | stopWordDictionary.add("叫".toCharArray()); 739 | stopWordDictionary.add("接着".toCharArray()); 740 | stopWordDictionary.add("结果".toCharArray()); 741 | stopWordDictionary.add("借".toCharArray()); 742 | stopWordDictionary.add("紧接着".toCharArray()); 743 | stopWordDictionary.add("进而".toCharArray()); 744 | stopWordDictionary.add("尽".toCharArray()); 745 | stopWordDictionary.add("尽管".toCharArray()); 746 | stopWordDictionary.add("经".toCharArray()); 747 | stopWordDictionary.add("经过".toCharArray()); 748 | stopWordDictionary.add("就".toCharArray()); 749 | stopWordDictionary.add("就是".toCharArray()); 750 | stopWordDictionary.add("就是说".toCharArray()); 751 | stopWordDictionary.add("据".toCharArray()); 752 | stopWordDictionary.add("具体地说".toCharArray()); 753 | stopWordDictionary.add("具体说来".toCharArray()); 754 | stopWordDictionary.add("开始".toCharArray()); 755 | stopWordDictionary.add("开外".toCharArray()); 756 | stopWordDictionary.add("靠".toCharArray()); 757 | stopWordDictionary.add("咳".toCharArray()); 758 | stopWordDictionary.add("可".toCharArray()); 759 | stopWordDictionary.add("可见".toCharArray()); 760 | stopWordDictionary.add("可是".toCharArray()); 761 | stopWordDictionary.add("可以".toCharArray()); 762 | stopWordDictionary.add("况且".toCharArray()); 763 | stopWordDictionary.add("啦".toCharArray()); 764 | stopWordDictionary.add("来".toCharArray()); 765 | stopWordDictionary.add("来着".toCharArray()); 766 | stopWordDictionary.add("离".toCharArray()); 767 | stopWordDictionary.add("例如".toCharArray()); 768 | stopWordDictionary.add("哩".toCharArray()); 769 | stopWordDictionary.add("连".toCharArray()); 770 | stopWordDictionary.add("连同".toCharArray()); 771 | stopWordDictionary.add("两者".toCharArray()); 772 | stopWordDictionary.add("了".toCharArray()); 773 | stopWordDictionary.add("临".toCharArray()); 774 | stopWordDictionary.add("另".toCharArray()); 775 | stopWordDictionary.add("另外".toCharArray()); 776 | stopWordDictionary.add("另一方面".toCharArray()); 777 | stopWordDictionary.add("论".toCharArray()); 778 | stopWordDictionary.add("嘛".toCharArray()); 779 | stopWordDictionary.add("吗".toCharArray()); 780 | stopWordDictionary.add("慢说".toCharArray()); 781 | stopWordDictionary.add("漫说".toCharArray()); 782 | stopWordDictionary.add("冒".toCharArray()); 783 | stopWordDictionary.add("么".toCharArray()); 784 | stopWordDictionary.add("每".toCharArray()); 785 | stopWordDictionary.add("每当".toCharArray()); 786 | stopWordDictionary.add("们".toCharArray()); 787 | stopWordDictionary.add("莫若".toCharArray()); 788 | stopWordDictionary.add("某".toCharArray()); 789 | stopWordDictionary.add("某个".toCharArray()); 790 | stopWordDictionary.add("某些".toCharArray()); 791 | stopWordDictionary.add("拿".toCharArray()); 792 | stopWordDictionary.add("哪".toCharArray()); 793 | stopWordDictionary.add("哪边".toCharArray()); 794 | stopWordDictionary.add("哪儿".toCharArray()); 795 | stopWordDictionary.add("哪个".toCharArray()); 796 | stopWordDictionary.add("哪里".toCharArray()); 797 | stopWordDictionary.add("哪年".toCharArray()); 798 | stopWordDictionary.add("哪怕".toCharArray()); 799 | stopWordDictionary.add("哪天".toCharArray()); 800 | 
stopWordDictionary.add("哪些".toCharArray()); 801 | stopWordDictionary.add("哪样".toCharArray()); 802 | stopWordDictionary.add("那".toCharArray()); 803 | stopWordDictionary.add("那边".toCharArray()); 804 | stopWordDictionary.add("那儿".toCharArray()); 805 | stopWordDictionary.add("那个".toCharArray()); 806 | stopWordDictionary.add("那会儿".toCharArray()); 807 | stopWordDictionary.add("那里".toCharArray()); 808 | stopWordDictionary.add("那么".toCharArray()); 809 | stopWordDictionary.add("那么些".toCharArray()); 810 | stopWordDictionary.add("那么样".toCharArray()); 811 | stopWordDictionary.add("那时".toCharArray()); 812 | stopWordDictionary.add("那些".toCharArray()); 813 | stopWordDictionary.add("那样".toCharArray()); 814 | stopWordDictionary.add("乃".toCharArray()); 815 | stopWordDictionary.add("乃至".toCharArray()); 816 | stopWordDictionary.add("呢".toCharArray()); 817 | stopWordDictionary.add("能".toCharArray()); 818 | stopWordDictionary.add("你".toCharArray()); 819 | stopWordDictionary.add("你们".toCharArray()); 820 | stopWordDictionary.add("您".toCharArray()); 821 | stopWordDictionary.add("宁".toCharArray()); 822 | stopWordDictionary.add("宁可".toCharArray()); 823 | stopWordDictionary.add("宁肯".toCharArray()); 824 | stopWordDictionary.add("宁愿".toCharArray()); 825 | stopWordDictionary.add("哦".toCharArray()); 826 | stopWordDictionary.add("呕".toCharArray()); 827 | stopWordDictionary.add("啪达".toCharArray()); 828 | stopWordDictionary.add("旁人".toCharArray()); 829 | stopWordDictionary.add("呸".toCharArray()); 830 | stopWordDictionary.add("凭".toCharArray()); 831 | stopWordDictionary.add("凭借".toCharArray()); 832 | stopWordDictionary.add("其".toCharArray()); 833 | stopWordDictionary.add("其次".toCharArray()); 834 | stopWordDictionary.add("其二".toCharArray()); 835 | stopWordDictionary.add("其他".toCharArray()); 836 | stopWordDictionary.add("其它".toCharArray()); 837 | stopWordDictionary.add("其一".toCharArray()); 838 | stopWordDictionary.add("其余".toCharArray()); 839 | stopWordDictionary.add("其中".toCharArray()); 840 | stopWordDictionary.add("起".toCharArray()); 841 | stopWordDictionary.add("起见".toCharArray()); 842 | stopWordDictionary.add("起见".toCharArray()); 843 | stopWordDictionary.add("岂但".toCharArray()); 844 | stopWordDictionary.add("恰恰相反".toCharArray()); 845 | stopWordDictionary.add("前后".toCharArray()); 846 | stopWordDictionary.add("前者".toCharArray()); 847 | stopWordDictionary.add("且".toCharArray()); 848 | stopWordDictionary.add("然而".toCharArray()); 849 | stopWordDictionary.add("然后".toCharArray()); 850 | stopWordDictionary.add("然则".toCharArray()); 851 | stopWordDictionary.add("让".toCharArray()); 852 | stopWordDictionary.add("人家".toCharArray()); 853 | stopWordDictionary.add("任".toCharArray()); 854 | stopWordDictionary.add("任何".toCharArray()); 855 | stopWordDictionary.add("任凭".toCharArray()); 856 | stopWordDictionary.add("如".toCharArray()); 857 | stopWordDictionary.add("如此".toCharArray()); 858 | stopWordDictionary.add("如果".toCharArray()); 859 | stopWordDictionary.add("如何".toCharArray()); 860 | stopWordDictionary.add("如其".toCharArray()); 861 | stopWordDictionary.add("如若".toCharArray()); 862 | stopWordDictionary.add("如上所述".toCharArray()); 863 | stopWordDictionary.add("若".toCharArray()); 864 | stopWordDictionary.add("若非".toCharArray()); 865 | stopWordDictionary.add("若是".toCharArray()); 866 | stopWordDictionary.add("啥".toCharArray()); 867 | stopWordDictionary.add("上下".toCharArray()); 868 | stopWordDictionary.add("尚且".toCharArray()); 869 | stopWordDictionary.add("设若".toCharArray()); 870 | stopWordDictionary.add("设使".toCharArray()); 871 | 
stopWordDictionary.add("甚而".toCharArray()); 872 | stopWordDictionary.add("甚么".toCharArray()); 873 | stopWordDictionary.add("甚至".toCharArray()); 874 | stopWordDictionary.add("省得".toCharArray()); 875 | stopWordDictionary.add("时候".toCharArray()); 876 | stopWordDictionary.add("什么".toCharArray()); 877 | stopWordDictionary.add("什么样".toCharArray()); 878 | stopWordDictionary.add("使得".toCharArray()); 879 | stopWordDictionary.add("是".toCharArray()); 880 | stopWordDictionary.add("是的".toCharArray()); 881 | stopWordDictionary.add("首先".toCharArray()); 882 | stopWordDictionary.add("谁".toCharArray()); 883 | stopWordDictionary.add("谁知".toCharArray()); 884 | stopWordDictionary.add("顺".toCharArray()); 885 | stopWordDictionary.add("顺着".toCharArray()); 886 | stopWordDictionary.add("似的".toCharArray()); 887 | stopWordDictionary.add("虽".toCharArray()); 888 | stopWordDictionary.add("虽然".toCharArray()); 889 | stopWordDictionary.add("虽说".toCharArray()); 890 | stopWordDictionary.add("虽则".toCharArray()); 891 | stopWordDictionary.add("随".toCharArray()); 892 | stopWordDictionary.add("随着".toCharArray()); 893 | stopWordDictionary.add("所".toCharArray()); 894 | stopWordDictionary.add("所以".toCharArray()); 895 | stopWordDictionary.add("他".toCharArray()); 896 | stopWordDictionary.add("他们".toCharArray()); 897 | stopWordDictionary.add("他人".toCharArray()); 898 | stopWordDictionary.add("它".toCharArray()); 899 | stopWordDictionary.add("它们".toCharArray()); 900 | stopWordDictionary.add("她".toCharArray()); 901 | stopWordDictionary.add("她们".toCharArray()); 902 | stopWordDictionary.add("倘".toCharArray()); 903 | stopWordDictionary.add("倘或".toCharArray()); 904 | stopWordDictionary.add("倘然".toCharArray()); 905 | stopWordDictionary.add("倘若".toCharArray()); 906 | stopWordDictionary.add("倘使".toCharArray()); 907 | stopWordDictionary.add("腾".toCharArray()); 908 | stopWordDictionary.add("替".toCharArray()); 909 | stopWordDictionary.add("通过".toCharArray()); 910 | stopWordDictionary.add("同".toCharArray()); 911 | stopWordDictionary.add("同时".toCharArray()); 912 | stopWordDictionary.add("哇".toCharArray()); 913 | stopWordDictionary.add("万一".toCharArray()); 914 | stopWordDictionary.add("往".toCharArray()); 915 | stopWordDictionary.add("望".toCharArray()); 916 | stopWordDictionary.add("为".toCharArray()); 917 | stopWordDictionary.add("为何".toCharArray()); 918 | stopWordDictionary.add("为了".toCharArray()); 919 | stopWordDictionary.add("为什么".toCharArray()); 920 | stopWordDictionary.add("为着".toCharArray()); 921 | stopWordDictionary.add("喂".toCharArray()); 922 | stopWordDictionary.add("嗡嗡".toCharArray()); 923 | stopWordDictionary.add("我".toCharArray()); 924 | stopWordDictionary.add("我们".toCharArray()); 925 | stopWordDictionary.add("呜".toCharArray()); 926 | stopWordDictionary.add("呜呼".toCharArray()); 927 | stopWordDictionary.add("乌乎".toCharArray()); 928 | stopWordDictionary.add("无论".toCharArray()); 929 | stopWordDictionary.add("无宁".toCharArray()); 930 | stopWordDictionary.add("毋宁".toCharArray()); 931 | stopWordDictionary.add("嘻".toCharArray()); 932 | stopWordDictionary.add("吓".toCharArray()); 933 | stopWordDictionary.add("相对而言".toCharArray()); 934 | stopWordDictionary.add("像".toCharArray()); 935 | stopWordDictionary.add("向".toCharArray()); 936 | stopWordDictionary.add("向着".toCharArray()); 937 | stopWordDictionary.add("嘘".toCharArray()); 938 | stopWordDictionary.add("呀".toCharArray()); 939 | stopWordDictionary.add("焉".toCharArray()); 940 | stopWordDictionary.add("沿".toCharArray()); 941 | stopWordDictionary.add("沿着".toCharArray()); 942 | 
stopWordDictionary.add("要".toCharArray()); 943 | stopWordDictionary.add("要不".toCharArray()); 944 | stopWordDictionary.add("要不然".toCharArray()); 945 | stopWordDictionary.add("要不是".toCharArray()); 946 | stopWordDictionary.add("要么".toCharArray()); 947 | stopWordDictionary.add("要是".toCharArray()); 948 | stopWordDictionary.add("也".toCharArray()); 949 | stopWordDictionary.add("也罢".toCharArray()); 950 | stopWordDictionary.add("也好".toCharArray()); 951 | stopWordDictionary.add("一".toCharArray()); 952 | stopWordDictionary.add("一般".toCharArray()); 953 | stopWordDictionary.add("一旦".toCharArray()); 954 | stopWordDictionary.add("一方面".toCharArray()); 955 | stopWordDictionary.add("一来".toCharArray()); 956 | stopWordDictionary.add("一切".toCharArray()); 957 | stopWordDictionary.add("一样".toCharArray()); 958 | stopWordDictionary.add("一则".toCharArray()); 959 | stopWordDictionary.add("依".toCharArray()); 960 | stopWordDictionary.add("依照".toCharArray()); 961 | stopWordDictionary.add("矣".toCharArray()); 962 | stopWordDictionary.add("以".toCharArray()); 963 | stopWordDictionary.add("以便".toCharArray()); 964 | stopWordDictionary.add("以及".toCharArray()); 965 | stopWordDictionary.add("以免".toCharArray()); 966 | stopWordDictionary.add("以至".toCharArray()); 967 | stopWordDictionary.add("以至于".toCharArray()); 968 | stopWordDictionary.add("以致".toCharArray()); 969 | stopWordDictionary.add("抑或".toCharArray()); 970 | stopWordDictionary.add("因".toCharArray()); 971 | stopWordDictionary.add("因此".toCharArray()); 972 | stopWordDictionary.add("因而".toCharArray()); 973 | stopWordDictionary.add("因为".toCharArray()); 974 | stopWordDictionary.add("哟".toCharArray()); 975 | stopWordDictionary.add("用".toCharArray()); 976 | stopWordDictionary.add("由".toCharArray()); 977 | stopWordDictionary.add("由此可见".toCharArray()); 978 | stopWordDictionary.add("由于".toCharArray()); 979 | stopWordDictionary.add("有".toCharArray()); 980 | stopWordDictionary.add("有的".toCharArray()); 981 | stopWordDictionary.add("有关".toCharArray()); 982 | stopWordDictionary.add("有些".toCharArray()); 983 | stopWordDictionary.add("又".toCharArray()); 984 | stopWordDictionary.add("于".toCharArray()); 985 | stopWordDictionary.add("于是".toCharArray()); 986 | stopWordDictionary.add("于是乎".toCharArray()); 987 | stopWordDictionary.add("与".toCharArray()); 988 | stopWordDictionary.add("与此同时".toCharArray()); 989 | stopWordDictionary.add("与否".toCharArray()); 990 | stopWordDictionary.add("与其".toCharArray()); 991 | stopWordDictionary.add("越是".toCharArray()); 992 | stopWordDictionary.add("云云".toCharArray()); 993 | stopWordDictionary.add("哉".toCharArray()); 994 | stopWordDictionary.add("再说".toCharArray()); 995 | stopWordDictionary.add("再者".toCharArray()); 996 | stopWordDictionary.add("在".toCharArray()); 997 | stopWordDictionary.add("在下".toCharArray()); 998 | stopWordDictionary.add("咱".toCharArray()); 999 | stopWordDictionary.add("咱们".toCharArray()); 1000 | stopWordDictionary.add("则".toCharArray()); 1001 | stopWordDictionary.add("怎".toCharArray()); 1002 | stopWordDictionary.add("怎么".toCharArray()); 1003 | stopWordDictionary.add("怎么办".toCharArray()); 1004 | stopWordDictionary.add("怎么样".toCharArray()); 1005 | stopWordDictionary.add("怎样".toCharArray()); 1006 | stopWordDictionary.add("咋".toCharArray()); 1007 | stopWordDictionary.add("照".toCharArray()); 1008 | stopWordDictionary.add("照着".toCharArray()); 1009 | stopWordDictionary.add("者".toCharArray()); 1010 | stopWordDictionary.add("这".toCharArray()); 1011 | stopWordDictionary.add("这边".toCharArray()); 1012 | stopWordDictionary.add("这儿".toCharArray()); 1013 | 
stopWordDictionary.add("这个".toCharArray()); 1014 | stopWordDictionary.add("这会儿".toCharArray()); 1015 | stopWordDictionary.add("这就是说".toCharArray()); 1016 | stopWordDictionary.add("这里".toCharArray()); 1017 | stopWordDictionary.add("这么".toCharArray()); 1018 | stopWordDictionary.add("这么点儿".toCharArray()); 1019 | stopWordDictionary.add("这么些".toCharArray()); 1020 | stopWordDictionary.add("这么样".toCharArray()); 1021 | stopWordDictionary.add("这时".toCharArray()); 1022 | stopWordDictionary.add("这些".toCharArray()); 1023 | stopWordDictionary.add("这样".toCharArray()); 1024 | stopWordDictionary.add("正如".toCharArray()); 1025 | stopWordDictionary.add("吱".toCharArray()); 1026 | stopWordDictionary.add("之".toCharArray()); 1027 | stopWordDictionary.add("之类".toCharArray()); 1028 | stopWordDictionary.add("之所以".toCharArray()); 1029 | stopWordDictionary.add("之一".toCharArray()); 1030 | stopWordDictionary.add("只是".toCharArray()); 1031 | stopWordDictionary.add("只限".toCharArray()); 1032 | stopWordDictionary.add("只要".toCharArray()); 1033 | stopWordDictionary.add("只有".toCharArray()); 1034 | stopWordDictionary.add("至".toCharArray()); 1035 | stopWordDictionary.add("至于".toCharArray()); 1036 | stopWordDictionary.add("诸位".toCharArray()); 1037 | stopWordDictionary.add("着".toCharArray()); 1038 | stopWordDictionary.add("着呢".toCharArray()); 1039 | stopWordDictionary.add("自".toCharArray()); 1040 | stopWordDictionary.add("自从".toCharArray()); 1041 | stopWordDictionary.add("自个儿".toCharArray()); 1042 | stopWordDictionary.add("自各儿".toCharArray()); 1043 | stopWordDictionary.add("自己".toCharArray()); 1044 | stopWordDictionary.add("自家".toCharArray()); 1045 | stopWordDictionary.add("自身".toCharArray()); 1046 | stopWordDictionary.add("综上所述".toCharArray()); 1047 | stopWordDictionary.add("总的来看".toCharArray()); 1048 | stopWordDictionary.add("总的来说".toCharArray()); 1049 | stopWordDictionary.add("总的说来".toCharArray()); 1050 | stopWordDictionary.add("总而言之".toCharArray()); 1051 | stopWordDictionary.add("总之".toCharArray()); 1052 | stopWordDictionary.add("纵".toCharArray()); 1053 | stopWordDictionary.add("纵令".toCharArray()); 1054 | stopWordDictionary.add("纵然".toCharArray()); 1055 | stopWordDictionary.add("纵使".toCharArray()); 1056 | stopWordDictionary.add("遵照".toCharArray()); 1057 | stopWordDictionary.add("作为".toCharArray()); 1058 | stopWordDictionary.add("兮".toCharArray()); 1059 | stopWordDictionary.add("呃".toCharArray()); 1060 | stopWordDictionary.add("呗".toCharArray()); 1061 | stopWordDictionary.add("咚".toCharArray()); 1062 | stopWordDictionary.add("咦".toCharArray()); 1063 | stopWordDictionary.add("喏".toCharArray()); 1064 | stopWordDictionary.add("啐".toCharArray()); 1065 | stopWordDictionary.add("喔唷".toCharArray()); 1066 | stopWordDictionary.add("嗬".toCharArray()); 1067 | stopWordDictionary.add("嗯".toCharArray()); 1068 | stopWordDictionary.add("嗳".toCharArray()); 1069 | } 1070 | 1071 | 1072 | /** 1073 | * 返回useSmart标志位 1074 | * isSmartMode =true ,分词器使用智能切分策略, =false则使用细粒度切分 1075 | * 1076 | * @return isSmartMode 1077 | */ 1078 | public boolean isSmartMode() { 1079 | return smartMode; 1080 | } 1081 | 1082 | /** 1083 | * 设置useSmart标志位 1084 | * isSmartMode =true ,分词器使用智能切分策略, =false则使用细粒度切分 1085 | * 1086 | * @param smartMode 1087 | */ 1088 | public void setSmartMode(boolean smartMode) { 1089 | this.smartMode = smartMode; 1090 | } 1091 | 1092 | @Override 1093 | public List getMainDictionary() { 1094 | return mainDictionary; 1095 | } 1096 | 1097 | @Override 1098 | public List getStopWordDictionary() { 1099 | return stopWordDictionary; 1100 | } 1101 | 1102 | 
@Override 1103 | public List getQuantifierDictionary() { 1104 | return quantifierDictionary; 1105 | } 1106 | 1107 | 1108 | } 1109 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/.gitignore: -------------------------------------------------------------------------------- 1 | ./data/ 2 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'distribution' 2 | 3 | ext { 4 | LUCENCE_VERSION = '4.10.4' 5 | ELASTICSEARCH_VERSION = '1.6.0' 6 | } 7 | 8 | group = "io.github.zacker330.es" 9 | archivesBaseName = "ik-analysis-es-plugin" 10 | version = "1.0.1" 11 | 12 | dependencies { 13 | 14 | compile project(':ik-analysis-core') 15 | 16 | compile("org.elasticsearch:elasticsearch:$ELASTICSEARCH_VERSION") 17 | compile("org.apache.lucene:lucene-core:$LUCENCE_VERSION") 18 | compile("org.apache.lucene:lucene-queryparser:$LUCENCE_VERSION") 19 | compile("org.apache.lucene:lucene-analyzers-common:$LUCENCE_VERSION") 20 | runtime('ch.qos.logback:logback-classic:1.1.3') 21 | 22 | testCompile("org.apache.lucene:lucene-test-framework:$LUCENCE_VERSION") { 23 | exclude module: 'randomizedtesting-runner' 24 | } 25 | 26 | testCompile('junit:junit:4.12') 27 | testCompile('org.hamcrest:hamcrest-all:1.3') 28 | testCompile("com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.16") 29 | 30 | testCompile group: 'org.elasticsearch', name: 'elasticsearch', version: ELASTICSEARCH_VERSION, classifier: 'tests' 31 | 32 | } 33 | 34 | 35 | modifyPom { 36 | project { 37 | name 'es-ik' 38 | description 'Kind of Chinese Analysis for Elasticsearch' 39 | url 'https://github.com/zacker330/es-ik' 40 | inceptionYear '2015' 41 | 42 | scm { 43 | url 'https://github.com/zacker330/es-ik' 44 | connection 'scm:https://github.com/zacker330/es-ik.git' 45 | developerConnection 'scm:git@github.com:zacker330/es-ik.git' 46 | } 47 | 48 | licenses { 49 | license { 50 | name 'The Apache Software License, Version 2.0' 51 | url 'http://www.apache.org/licenses/LICENSE-2.0.txt' 52 | distribution 'repo' 53 | } 54 | } 55 | 56 | developers { 57 | developer { 58 | id 'zacker330' 59 | name 'Jack' 60 | email 'zacker330@gmail.com' 61 | } 62 | } 63 | } 64 | } 65 | 66 | extraArchive { 67 | sources = true 68 | tests = true 69 | javadoc = true 70 | } 71 | 72 | 73 | 74 | distributions { 75 | main { 76 | baseName = 'ik-analysis-es-plugin' 77 | contents { 78 | from { "build/libs/" } 79 | from { "libs/" } 80 | from { project(":ik-analysis-core").buildDir.path + '/libs/' } 81 | from { project(":ik-analysis-es-plugin").buildDir.path + '/libs/' } 82 | } 83 | 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/IKAnalysisBinderProcessor.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.elasticsearch.index.analysis.AnalysisModule; 4 | 5 | public class IKAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { 6 | @Override 7 | public void processAnalyzers(AnalyzersBindings analyzersBindings) { 8 | analyzersBindings.processAnalyzer("ik_analysis", IKAnalyzerProvider.class); 9 | } 10 | 11 | @Override 12 | public void processTokenizers(TokenizersBindings tokenizersBindings) { 13 | 
tokenizersBindings.processTokenizer("ik_tokenizer", IKTokenizerFactory.class); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/IKAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.elasticsearch.common.inject.Inject; 4 | import org.elasticsearch.common.inject.assistedinject.Assisted; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.Index; 8 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 9 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 10 | import org.elasticsearch.index.settings.IndexSettings; 11 | import org.wltea.analyzer.lucene.IKAnalyzer; 12 | 13 | import java.util.Iterator; 14 | import java.util.ServiceLoader; 15 | 16 | public class IKAnalyzerProvider extends AbstractIndexAnalyzerProvider { 17 | private final IKAnalyzer analyzer; 18 | private ServiceLoader loader; 19 | 20 | @Inject 21 | public IKAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { 22 | super(index, indexSettings, name, settings); 23 | 24 | loader = ServiceLoader.load(Configuration.class); 25 | Iterator iterator = loader.iterator(); 26 | if (!iterator.hasNext()) { 27 | throw new NotFoundIKAnalyzerConfigurationImplementation(); 28 | } 29 | analyzer = new IKAnalyzer(iterator.next().init(index, indexSettings, env, name, settings)); 30 | } 31 | 32 | @Override 33 | public IKAnalyzer get() { 34 | return this.analyzer; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/IKTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.inject.assistedinject.Assisted; 6 | import org.elasticsearch.common.logging.ESLogger; 7 | import org.elasticsearch.common.logging.ESLoggerFactory; 8 | import org.elasticsearch.common.settings.Settings; 9 | import org.elasticsearch.env.Environment; 10 | import org.elasticsearch.index.Index; 11 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 12 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 13 | import org.elasticsearch.index.settings.IndexSettings; 14 | import org.wltea.analyzer.lucene.IKTokenizer; 15 | 16 | import java.io.Reader; 17 | import java.util.Iterator; 18 | import java.util.ServiceLoader; 19 | 20 | public class IKTokenizerFactory extends AbstractTokenizerFactory { 21 | private final ESLogger logger = ESLoggerFactory.getLogger(IKTokenizerFactory.class.getName()); 22 | 23 | private Configuration configuration; 24 | private ServiceLoader loader; 25 | 26 | 27 | @Inject 28 | public IKTokenizerFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { 29 | super(index, indexSettings, name, settings); 30 | loader = ServiceLoader.load(Configuration.class); 31 | Iterator iterator = loader.iterator(); 32 | if (!iterator.hasNext()) { 33 | logger.error("please provide the implementation of 
Configuration interface"); 34 | throw new NotFoundIKAnalyzerConfigurationImplementation(); 35 | } 36 | 37 | configuration = iterator.next(); 38 | configuration.init(index, indexSettings, env, name, settings); 39 | 40 | } 41 | 42 | @Override 43 | public Tokenizer create(Reader reader) { 44 | return new IKTokenizer(reader, configuration); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/NotFoundIKAnalyzerConfigurationImplementation.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | public class NotFoundIKAnalyzerConfigurationImplementation extends RuntimeException { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/spi/Configuration.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik.spi; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.index.settings.IndexSettings; 7 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 8 | 9 | public interface Configuration extends DictionaryConfiguration { 10 | Configuration init(Index index, @IndexSettings Settings indexSettings, Environment env, String name, Settings settings); 11 | } 12 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/plugin/analyzer/ik/AnalysisIKPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analyzer.ik; 2 | 3 | import org.elasticsearch.common.inject.Module; 4 | import org.elasticsearch.index.analysis.AnalysisModule; 5 | import org.elasticsearch.index.analysis.ik.IKAnalysisBinderProcessor; 6 | import org.elasticsearch.plugins.AbstractPlugin; 7 | 8 | public class AnalysisIKPlugin extends AbstractPlugin { 9 | @Override 10 | public String name() { 11 | return "ik_analysis"; 12 | } 13 | 14 | @Override 15 | public String description() { 16 | return "IK Chinese analysis support"; 17 | } 18 | 19 | @Override public void processModule(Module module) { 20 | if (module instanceof AnalysisModule) { 21 | AnalysisModule analysisModule = (AnalysisModule) module; 22 | analysisModule.addProcessor(new IKAnalysisBinderProcessor()); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.lucene; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 6 | 7 | import java.io.Reader; 8 | 9 | public final class IKAnalyzer extends Analyzer { 10 | 11 | private DictionaryConfiguration configuration; 12 | 13 | public IKAnalyzer(DictionaryConfiguration configuration) { 14 | super(); 15 | this.configuration = configuration; 16 | } 17 | @Override 18 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) { 19 | Tokenizer _IKTokenizer = new IKTokenizer(in, 
configuration); 20 | return new TokenStreamComponents(_IKTokenizer); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK Chinese word segmentation, version 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com) 21 | * Copyright 2012, Oolong Studio 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import org.apache.lucene.analysis.Tokenizer; 30 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 31 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 32 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 33 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 34 | import org.wltea.analyzer.core.IKSegmenter; 35 | import org.wltea.analyzer.core.Lexeme; 36 | 37 | import java.io.IOException; 38 | import java.io.Reader; 39 | 40 | 41 | /** 42 | * Lucene Tokenizer adapter for the IK segmenter 43 | * Compatible with Lucene 4.0 44 | */ 45 | public final class IKTokenizer extends Tokenizer { 46 | 47 | // the underlying IK segmenter implementation 48 | private IKSegmenter _IKImplement; 49 | 50 | // lexeme text attribute 51 | private final CharTermAttribute termAtt; 52 | // lexeme offset attribute 53 | private final OffsetAttribute offsetAtt; 54 | // lexeme type attribute (see the type constants in org.wltea.analyzer.core.Lexeme) 55 | private final TypeAttribute typeAtt; 56 | // records the end position of the last lexeme 57 | private int endPosition; 58 | 59 | public IKTokenizer(Reader in, DictionaryConfiguration configuration) { 60 | super(in); 61 | offsetAtt = addAttribute(OffsetAttribute.class); 62 | termAtt = addAttribute(CharTermAttribute.class); 63 | typeAtt = addAttribute(TypeAttribute.class); 64 | _IKImplement = new IKSegmenter(input, configuration); 65 | } 66 | 67 | /* (non-Javadoc) 68 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 69 | */ 70 | @Override 71 | public boolean incrementToken() throws IOException { 72 | // clear all lexeme attributes 73 | clearAttributes(); 74 | Lexeme nextLexeme = _IKImplement.next(); 75 | if (nextLexeme != null) { 76 | // convert the Lexeme into token attributes 77 | // set the lexeme text 78 | termAtt.append(nextLexeme.getLexemeText()); 79 | // set the lexeme length 80 | termAtt.setLength(nextLexeme.getLength()); 81 | // set the lexeme offsets 82 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); 83 | // record the last position reached by segmentation 84 | endPosition = nextLexeme.getEndPosition(); 85 | // record the lexeme type 86 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 87 | // return true to signal that another lexeme is available 88 | return true; 89 | } 90 | // return false to signal that lexeme output is finished 91 | return false; 92 | } 93 | 94 | /* 95 | * 
(non-Javadoc) 96 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 97 | */ 98 | @Override 99 | public void reset() throws IOException { 100 | super.reset(); 101 | _IKImplement.reset(input); 102 | } 103 | 104 | @Override 105 | public final void end() throws IOException { 106 | super.end(); 107 | // set final offset 108 | int finalOffset = correctOffset(this.endPosition); 109 | offsetAtt.setOffset(finalOffset, finalOffset); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/resources/es-plugin.properties: -------------------------------------------------------------------------------- 1 | plugin=org.elasticsearch.plugin.analyzer.ik.AnalysisIKPlugin 2 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/test/java/IkESPluginTest.java: -------------------------------------------------------------------------------- 1 | import org.elasticsearch.Version; 2 | import org.elasticsearch.cluster.metadata.IndexMetaData; 3 | import org.elasticsearch.common.inject.Injector; 4 | import org.elasticsearch.common.inject.ModulesBuilder; 5 | import org.elasticsearch.common.settings.ImmutableSettings; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.common.settings.SettingsModule; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.env.EnvironmentModule; 10 | import org.elasticsearch.index.Index; 11 | import org.elasticsearch.index.IndexNameModule; 12 | import org.elasticsearch.index.analysis.AnalysisModule; 13 | import org.elasticsearch.index.analysis.AnalysisService; 14 | import org.elasticsearch.index.analysis.TokenizerFactory; 15 | import org.elasticsearch.index.analysis.ik.IKAnalysisBinderProcessor; 16 | import org.elasticsearch.index.analysis.ik.IKTokenizerFactory; 17 | import org.elasticsearch.index.settings.IndexSettingsModule; 18 | import org.elasticsearch.indices.analysis.IndicesAnalysisModule; 19 | import org.elasticsearch.indices.analysis.IndicesAnalysisService; 20 | import org.elasticsearch.test.ElasticsearchTestCase; 21 | import org.hamcrest.MatcherAssert; 22 | import org.junit.Test; 23 | 24 | import static org.hamcrest.Matchers.instanceOf; 25 | 26 | 27 | public class IkESPluginTest extends ElasticsearchTestCase { 28 | 29 | 30 | @Test 31 | public void testDefaultsIKAnalysis() { 32 | Index index = new Index("test"); 33 | 34 | Settings settings = ImmutableSettings.settingsBuilder() 35 | .put("path.home", "none") 36 | .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) 37 | .build(); 38 | 39 | Injector parentInjector = new ModulesBuilder().add(new SettingsModule(ImmutableSettings.EMPTY), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); 40 | Injector injector = new ModulesBuilder().add( 41 | new IndexSettingsModule(index, settings), 42 | new IndexNameModule(index), 43 | new AnalysisModule(ImmutableSettings.EMPTY, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IKAnalysisBinderProcessor())) 44 | .createChildInjector(parentInjector); 45 | 46 | AnalysisService analysisService = injector.getInstance(AnalysisService.class); 47 | 48 | TokenizerFactory tokenizerFactory = analysisService.tokenizer("ik_tokenizer"); 49 | MatcherAssert.assertThat(tokenizerFactory, instanceOf(IKTokenizerFactory.class)); 50 | 51 | 52 | } 53 | } 54 | --------------------------------------------------------------------------------
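Note: IKAnalysisBinderProcessor above registers the analyzer as "ik_analysis" and the tokenizer as "ik_tokenizer". The following standalone snippet is a minimal sketch, not a file in this repository, of how those names could be wired into index settings once the plugin and a Configuration implementation (for example es-ik-sqlite3) are installed; the analyzer key "my_ik", the class name, and the create-index call mentioned in the comment are illustrative assumptions.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class IkIndexSettingsSketch {
    public static void main(String[] args) {
        // Define a custom analyzer "my_ik" (hypothetical name) built on the
        // "ik_tokenizer" registered by IKAnalysisBinderProcessor.
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.my_ik.type", "custom")
                .put("index.analysis.analyzer.my_ik.tokenizer", "ik_tokenizer")
                .build();
        // On an Elasticsearch 1.6 client these settings would be handed to a
        // create-index request, e.g. prepareCreate("my_index").setSettings(settings).
        System.out.println(settings.getAsMap());
    }
}

The same settings could equally be expressed as JSON in a create-index request body; the Java form mirrors the ImmutableSettings style used by IkESPluginTest above.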
/ik-analysis-es-plugin/src/test/java/org/elasticsearch/index/analysis/ik/MockConfiguration.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 7 | import org.elasticsearch.index.settings.IndexSettings; 8 | 9 | import java.util.Collections; 10 | import java.util.List; 11 | 12 | public class MockConfiguration implements Configuration { 13 | 14 | 15 | @Override 16 | public Configuration init(Index index, @IndexSettings Settings indexSettings, Environment env, String name, Settings settings) { 17 | 18 | return this; 19 | } 20 | 21 | @Override 22 | public boolean isSmartMode() { 23 | return false; 24 | } 25 | 26 | @Override 27 | public void setSmartMode(boolean useSmart) { 28 | 29 | } 30 | 31 | @Override 32 | public List getMainDictionary() { 33 | return Collections.emptyList(); 34 | } 35 | 36 | @Override 37 | public List getStopWordDictionary() { 38 | return Collections.emptyList(); 39 | } 40 | 41 | @Override 42 | public List getQuantifierDictionary() { 43 | return Collections.emptyList(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/test/resources/META-INF/services/org.elasticsearch.index.analysis.ik.spi.Configuration: -------------------------------------------------------------------------------- 1 | org.elasticsearch.index.analysis.ik.MockConfiguration 2 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include 'ik-analysis-core' 2 | 3 | include 'ik-analysis-es-plugin' 4 | 5 | include 'es-ik-sqlite3' 6 | 7 | --------------------------------------------------------------------------------
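Closing note: a minimal sketch, not part of the repository, of exercising IKAnalyzer directly through the Lucene 4.10 TokenStream API outside Elasticsearch. It borrows the test-scoped MockConfiguration above, so all dictionaries are empty and the output is only illustrative; a real caller would pass a populated DictionaryConfiguration such as the one provided by es-ik-sqlite3. The class name and sample sentence are assumptions.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.index.analysis.ik.MockConfiguration;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.IOException;

public class IKAnalyzerUsageSketch {
    public static void main(String[] args) throws IOException {
        // Empty-dictionary configuration borrowed from the tests; swap in a real
        // DictionaryConfiguration implementation for meaningful segmentation.
        IKAnalyzer analyzer = new IKAnalyzer(new MockConfiguration());
        TokenStream stream = analyzer.tokenStream("content", "中华人民共和国国歌");
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // each iteration corresponds to one Lexeme emitted by IKSegmenter
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}

The loop mirrors what IKTokenizer does for Elasticsearch: each incrementToken() call surfaces one Lexeme from IKSegmenter as Lucene token attributes.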