├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── build.gradle ├── es-ik-sqlite3 ├── build.gradle ├── libs │ └── sqlite-jdbc-3.8.10.1.jar └── src │ ├── main │ ├── java │ │ └── io │ │ │ └── github │ │ │ └── zacker330 │ │ │ └── es │ │ │ └── ik │ │ │ └── es │ │ │ └── ik │ │ │ └── analyzer │ │ │ └── Sqlite3Configuration.java │ └── resources │ │ └── META-INF │ │ └── services │ │ └── org.elasticsearch.index.analysis.ik.spi.Configuration │ └── test │ ├── java │ ├── io │ │ └── github │ │ │ └── zacker330 │ │ │ └── es │ │ │ └── ik │ │ │ ├── AbstractIntegrationTest.java │ │ │ └── DictionaryDatasource.java │ └── org │ │ └── wltea │ │ └── analyzer │ │ ├── IKAnalzyerTest.java │ │ └── LuceneIndexAndSearchTest.java │ └── resources │ ├── database.sql │ ├── logback-test.xml │ ├── mainDic.properties │ ├── quantifierDic.properties │ └── stopwordDic.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── ik-analysis-core ├── build.gradle ├── config │ └── checkstyle │ │ └── checkstyle.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── wltea │ │ └── analyzer │ │ ├── configuration │ │ └── DictionaryConfiguration.java │ │ ├── core │ │ ├── AnalyzeContext.java │ │ ├── CJKSegmenter.java │ │ ├── CN_QuantifierSegmenter.java │ │ ├── CharacterUtil.java │ │ ├── IKArbitrator.java │ │ ├── IKSegmenter.java │ │ ├── ISegmenter.java │ │ ├── LetterSegmenter.java │ │ ├── Lexeme.java │ │ ├── LexemePath.java │ │ └── QuickSortSet.java │ │ └── dic │ │ ├── DictSegment.java │ │ ├── Dictionary.java │ │ └── Hit.java │ └── test │ └── java │ └── org │ └── wltea │ └── analyzer │ ├── IKSegmenterTest.java │ └── MockDictionary.java ├── ik-analysis-es-plugin ├── .gitignore ├── build.gradle └── src │ ├── main │ ├── java │ │ └── org │ │ │ ├── elasticsearch │ │ │ ├── index │ │ │ │ └── analysis │ │ │ │ │ └── ik │ │ │ │ │ ├── IKAnalysisBinderProcessor.java │ │ │ │ │ ├── IKAnalyzerProvider.java │ │ │ │ │ ├── IKTokenizerFactory.java │ │ │ │ │ ├── NotFoundIKAnalyzerConfigurationImplementation.java │ │ │ │ │ └── spi │ │ │ │ │ └── Configuration.java │ │ │ └── plugin │ │ │ │ └── analyzer │ │ │ │ └── ik │ │ │ │ └── AnalysisIKPlugin.java │ │ │ └── wltea │ │ │ └── analyzer │ │ │ └── lucene │ │ │ ├── IKAnalyzer.java │ │ │ └── IKTokenizer.java │ └── resources │ │ └── es-plugin.properties │ └── test │ ├── java │ ├── IkESPluginTest.java │ └── org │ │ └── elasticsearch │ │ └── index │ │ └── analysis │ │ └── ik │ │ └── MockConfiguration.java │ └── resources │ └── META-INF │ └── services │ └── org.elasticsearch.index.analysis.ik.spi.Configuration └── settings.gradle /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | target/ 3 | *.iws 4 | *.ipr 5 | *.iml 6 | build/ 7 | .gradle/* 8 | buildSrc/.gradle/ 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | - oraclejdk7 5 | 6 | notifications: 7 | email: true 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 
9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | 206 | Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was 207 | derived from unicode conversion examples available at 208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright 209 | from those sources: 210 | 211 | /* 212 | * Copyright 2001-2004 Unicode, Inc. 213 | * 214 | * Disclaimer 215 | * 216 | * This source code is provided as is by Unicode, Inc. No claims are 217 | * made as to fitness for any particular purpose. No warranties of any 218 | * kind are expressed or implied. The recipient agrees to determine 219 | * applicability of information provided. If this file has been 220 | * purchased on magnetic or optical media from Unicode, Inc., the 221 | * sole remedy for any claim will be exchange of defective media 222 | * within 90 days of receipt. 223 | * 224 | * Limitations on Rights to Redistribute This Code 225 | * 226 | * Unicode, Inc. hereby grants the right to freely use the information 227 | * supplied in this file in the creation of products supporting the 228 | * Unicode Standard, and to make copies of this file in any form 229 | * for internal or external distribution as long as this notice 230 | * remains attached. 231 | */ 232 | 233 | 234 | Some code in src/java/org/apache/lucene/util/ArrayUtil.java was 235 | derived from Python 2.4.2 sources available at 236 | http://www.python.org. Full license is here: 237 | 238 | http://www.python.org/download/releases/2.4.2/license/ 239 | 240 | 241 | Some code in src/java/org/apache/lucene/util/UnicodeUtil.java was 242 | derived from ICU (http://www.icu-project.org) 243 | The full license is available here: 244 | http://source.icu-project.org/repos/icu/icu/trunk/license.html 245 | 246 | /* 247 | * Copyright (C) 1999-2010, International Business Machines 248 | * Corporation and others. All Rights Reserved. 
249 | * 250 | * Permission is hereby granted, free of charge, to any person obtaining a copy 251 | * of this software and associated documentation files (the "Software"), to deal 252 | * in the Software without restriction, including without limitation the rights 253 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the 254 | * Software, and to permit persons to whom the Software is furnished to do so, 255 | * provided that the above copyright notice(s) and this permission notice appear 256 | * in all copies of the Software and that both the above copyright notice(s) and 257 | * this permission notice appear in supporting documentation. 258 | * 259 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 260 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 261 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 262 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE 263 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR 264 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 265 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 266 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 267 | * 268 | * Except as contained in this notice, the name of a copyright holder shall not 269 | * be used in advertising or otherwise to promote the sale, use or other 270 | * dealings in this Software without prior written authorization of the 271 | * copyright holder. 272 | */ 273 | 274 | The following license applies to the Snowball stemmers: 275 | 276 | Copyright (c) 2001, Dr Martin Porter 277 | Copyright (c) 2002, Richard Boulton 278 | All rights reserved. 279 | 280 | Redistribution and use in source and binary forms, with or without 281 | modification, are permitted provided that the following conditions are met: 282 | 283 | * Redistributions of source code must retain the above copyright notice, 284 | * this list of conditions and the following disclaimer. 285 | * Redistributions in binary form must reproduce the above copyright 286 | * notice, this list of conditions and the following disclaimer in the 287 | * documentation and/or other materials provided with the distribution. 288 | * Neither the name of the copyright holders nor the names of its contributors 289 | * may be used to endorse or promote products derived from this software 290 | * without specific prior written permission. 291 | 292 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 293 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 294 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 295 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 296 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 297 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 298 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 299 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 300 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 301 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
302 | 303 | The following license applies to the KStemmer: 304 | 305 | Copyright © 2003, 306 | Center for Intelligent Information Retrieval, 307 | University of Massachusetts, Amherst. 308 | All rights reserved. 309 | 310 | Redistribution and use in source and binary forms, with or without modification, 311 | are permitted provided that the following conditions are met: 312 | 313 | 1. Redistributions of source code must retain the above copyright notice, this 314 | list of conditions and the following disclaimer. 315 | 316 | 2. Redistributions in binary form must reproduce the above copyright notice, 317 | this list of conditions and the following disclaimer in the documentation 318 | and/or other materials provided with the distribution. 319 | 320 | 3. The names "Center for Intelligent Information Retrieval" and 321 | "University of Massachusetts" must not be used to endorse or promote products 322 | derived from this software without prior written permission. To obtain 323 | permission, contact info@ciir.cs.umass.edu. 324 | 325 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS 326 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 327 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 328 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE 329 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 330 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 331 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 332 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 333 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 334 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 335 | SUCH DAMAGE. 336 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Lucene 2 | Copyright 2011 The Apache Software Foundation 3 | 4 | This product includes software developed by 5 | The Apache Software Foundation (http://www.apache.org/). 6 | 7 | 8 | The IKAnalyzer 2012 source code (under org/wltea) was 9 | provided by Linliangyi and copyright 2012 by Oolong studio 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kind of Chinese Analysis for Elasticsearch [![Build Status](https://travis-ci.org/zacker330/es-ik.svg?branch=master)](https://travis-ci.org/zacker330/es-ik) 2 | 3 | # Requirements 4 | 5 | - Java 7 update 55 or later 6 | 7 | # Structure of es-ik 8 | 9 | * ik-analysis-core 10 | 11 | The algorithm in this module comes from [ik-analyzer](https://code.google.com/p/ik-analyzer/). In principle, you can use this module to implement a Solr analyzer plugin or an Elasticsearch plugin. 12 | 13 | You just need to implement the `DictionaryConfiguration` interface to provide the dictionary content used by the analysis process (see the sketch after this list). 14 | 15 | * ik-analysis-es-plugin: 16 | 17 | Integrates the ik-analysis-core module with Elasticsearch. Defines a kind of [SPI](https://en.wikipedia.org/wiki/Service_provider_interface): the `Configuration` interface, which extends `DictionaryConfiguration`. 18 | 19 | * es-ik-sqlite3 20 | 21 | Persists the dictionary content in a SQLite3 database. This module is a `service provider` for the `Configuration` SPI defined in ik-analysis-es-plugin.
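
For illustration, here is a minimal sketch of what a custom provider could look like. The class `InMemoryConfiguration` and its hard-coded terms are hypothetical; only the `Configuration` method signatures and the `ServiceLoader` mechanism mirror what the modules in this repository actually define:

    import java.util.ArrayList;
    import java.util.List;

    import org.elasticsearch.common.settings.Settings;
    import org.elasticsearch.env.Environment;
    import org.elasticsearch.index.Index;
    import org.elasticsearch.index.analysis.ik.spi.Configuration;

    // Hypothetical provider that serves the three dictionaries from memory.
    public class InMemoryConfiguration implements Configuration {

        private final List<char[]> main = new ArrayList<char[]>();
        private final List<char[]> quantifiers = new ArrayList<char[]>();
        private final List<char[]> stopWords = new ArrayList<char[]>();
        private boolean smartMode = true;

        public InMemoryConfiguration() {
            // In a real provider these terms would come from your own store (Redis, files, ...).
            main.add("中文".toCharArray());
            quantifiers.add("个".toCharArray());
            stopWords.add("the".toCharArray());
        }

        @Override
        public boolean isSmartMode() { return smartMode; }

        @Override
        public void setSmartMode(boolean useSmart) { this.smartMode = useSmart; }

        @Override
        public List<char[]> getMainDictionary() { return main; }

        @Override
        public List<char[]> getStopWordDictionary() { return stopWords; }

        @Override
        public List<char[]> getQuantifierDictionary() { return quantifiers; }

        // Called by the plugin once the Elasticsearch environment is available.
        @Override
        public Configuration init(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
            return this;
        }
    }

Register the provider by listing its fully qualified class name in `META-INF/services/org.elasticsearch.index.analysis.ik.spi.Configuration` (exactly as es-ik-sqlite3 does below), and the plugin can then discover it with the standard lookup:

    ServiceLoader<Configuration> loader = ServiceLoader.load(Configuration.class);
    Configuration configuration = loader.iterator().next(); // first registered provider
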
22 | 23 | 24 | # How to use es-ik 25 | 26 | Actually, ik-analysis-es-plugin exposes an interface, `DictionaryConfiguration`, as a kind of SPI. es-ik-sqlite3 implements it so that ik-analysis-es-plugin can read the dictionary content from SQLite. In other words, you can write your own implementation, for example one that persists the dictionary content in Redis. 27 | 28 | SPI is just a concept. In Java, I use [ServiceLoader](https://docs.oracle.com/javase/6/docs/api/java/util/ServiceLoader.html) to implement it. As long as your implementation conforms to ServiceLoader's usage, you get a new provider for ik-analysis-es-plugin without having to change the ik-analysis-es-plugin module itself. :P 29 | 30 | 31 | 32 | 33 | # How to use es-ik-sqlite3 (currently version 1.0.1) 34 | 35 | 36 | - tell Elasticsearch where your SQLite3 db is: add a setting to your elasticsearch.yml, like: 37 | 38 | ik_analysis_db_path: /opt/ik/dictionary.db 39 | 40 | PS: you can download my dictionary.db from https://github.com/zacker330/es-ik-sqlite3-dictionary 41 | 42 | 43 | - go into your Elasticsearch folder, then install the plugin: 44 | 45 | ./bin/plugin -i ik-analysis -u https://github.com/zacker330/es-ik-plugin-sqlite3-release/raw/master/es-ik-sqlite3-1.0.1.zip 46 | 47 | - test your configuration: 48 | 49 | 1. create the songs index 50 | 51 | curl -X PUT -H "Cache-Control: no-cache" -d '{ 52 | "settings":{ 53 | "index":{ 54 | "number_of_shards":1, 55 | "number_of_replicas": 1 56 | } 57 | } 58 | }' 'http://localhost:9200/songs/' 59 | 60 | 2. create the mapping for songs/song 61 | 62 | curl -X PUT -H "Cache-Control: no-cache" -d '{ 63 | "song": { 64 | "_source": {"enabled": true}, 65 | "_all": { 66 | "indexAnalyzer": "ik_analysis", 67 | "searchAnalyzer": "ik_analysis", 68 | "term_vector": "no", 69 | "store": "true" 70 | }, 71 | "properties":{ 72 | "title":{ 73 | "type": "string", 74 | "store": "yes", 75 | "indexAnalyzer": "ik_analysis", 76 | "searchAnalyzer": "ik_analysis", 77 | "include_in_all": "true" 78 | } 79 | } 80 | 81 | } 82 | } 83 | ' 'http://localhost:9200/songs/_mapping/song' 84 | 85 | 3. test it 86 | 87 | curl -X POST -d '林夕为我们作词' 'http://localhost:9200/songs/_analyze?analyzer=ik_analysis' 88 | 89 | response: 90 | {"tokens":[{"token":"林夕","start_offset":0,"end_offset":2,"type":"CN_WORD","position":1},{"token":"作词","start_offset":5,"end_offset":7,"type":"CN_WORD","position":2}]} 91 | 92 | # Create an empty SQLite3 db for es-ik-sqlite3 93 | 94 | 1. create the database 95 | 96 | sqlite3 dictionary.db 97 | 98 | 2.
create tables 99 | 100 | CREATE TABLE main_dictionary(term TEXT NOT NULL,unique(term)); 101 | CREATE TABLE quantifier_dictionary(term TEXT NOT NULL,unique(term)); 102 | CREATE TABLE stopword_dictionary(term TEXT NOT NULL,unique(term)); 103 | 104 | 105 | 617052 records ~= 30MB db file 106 | 107 | 108 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | jcenter() 4 | } 5 | dependencies { 6 | classpath 'com.bmuschko:gradle-nexus-plugin:2.3.1' 7 | } 8 | } 9 | 10 | //apply plugin: 'checkstyle' 11 | 12 | allprojects { 13 | apply plugin: 'idea' 14 | apply plugin: 'com.bmuschko.nexus' 15 | } 16 | 17 | subprojects { 18 | apply plugin: 'java' 19 | apply plugin: 'distribution' 20 | 21 | 22 | sourceCompatibility = 1.7 23 | version = '1.0' 24 | 25 | repositories { 26 | mavenCentral() 27 | } 28 | 29 | distZip { 30 | exclude("**/*-javadoc.jar") 31 | exclude("**/*-tests.jar") 32 | exclude("**/*-sources.jar") 33 | } 34 | 35 | 36 | test { 37 | // enable TestNG support (default is JUnit) 38 | 39 | // show standard out and standard error of the test JVM(s) on the console 40 | testLogging.showStandardStreams = true 41 | 42 | // set heap size for the test JVM(s) 43 | minHeapSize = "128m" 44 | maxHeapSize = "1024m" 45 | 46 | // set JVM arguments for the test JVM(s) 47 | jvmArgs '-XX:MaxPermSize=256m' 48 | 49 | // listen to events in the test execution lifecycle 50 | beforeTest { descriptor -> 51 | logger.lifecycle("Running test: " + descriptor) 52 | } 53 | 54 | // listen to standard out and standard error of the test JVM(s) 55 | // onOutput { descriptor, event -> 56 | // logger.lifecycle("Test: " + descriptor + " produced standard out/err: " + event.message ) 57 | // } 58 | } 59 | } 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /es-ik-sqlite3/build.gradle: -------------------------------------------------------------------------------- 1 | group = "io.github.zacker330.es" 2 | archivesBaseName = "es-ik-sqlite3" 3 | version = "1.0.1" 4 | 5 | ext { 6 | LUCENCE_VERSION = '4.10.4' 7 | ELASTICSEARCH_VERSION = '1.6.0' 8 | } 9 | 10 | dependencies { 11 | 12 | compile project(':ik-analysis-core') 13 | compile project(':ik-analysis-es-plugin') 14 | compile( 15 | "org.elasticsearch:elasticsearch:$ELASTICSEARCH_VERSION", 16 | "org.apache.lucene:lucene-core:$LUCENCE_VERSION", 17 | "org.apache.lucene:lucene-queryparser:$LUCENCE_VERSION", 18 | "org.apache.lucene:lucene-analyzers-common:$LUCENCE_VERSION", 19 | files('libs/sqlite-jdbc-3.8.10.1.jar') 20 | ) 21 | runtime('ch.qos.logback:logback-classic:1.1.3') 22 | 23 | testCompile('com.google.guava:guava:18.0') 24 | testCompile('commons-dbutils:commons-dbutils:1.6') 25 | testCompile('junit:junit:4.12', 26 | "com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.14", 27 | "org.apache.lucene:lucene-test-framework:$LUCENCE_VERSION" 28 | ) 29 | testCompile project(':ik-analysis-es-plugin') 30 | 31 | } 32 | 33 | modifyPom { 34 | project { 35 | name 'es-ik' 36 | description 'Kind of Chinese Analysis for Elasticsearch' 37 | url 'https://github.com/zacker330/es-ik' 38 | inceptionYear '2015' 39 | 40 | scm { 41 | url 'https://github.com/zacker330/es-ik' 42 | connection 'scm:https://github.com/zacker330/es-ik.git' 43 | developerConnection 'scm:git@github.com:zacker330/es-ik.git' 44 | } 45 | 46 | licenses { 47 | license { 48 | name 'The Apache Software 
License, Version 2.0' 49 | url 'http://www.apache.org/licenses/LICENSE-2.0.txt' 50 | distribution 'repo' 51 | } 52 | } 53 | 54 | developers { 55 | developer { 56 | id 'zacker330' 57 | name 'Jack' 58 | email 'zacker330@gmail.com' 59 | } 60 | } 61 | } 62 | } 63 | 64 | extraArchive { 65 | sources = true 66 | tests = true 67 | javadoc = true 68 | } 69 | 70 | distributions { 71 | main { 72 | baseName = 'es-ik-sqlite3' 73 | contents { 74 | from { "build/libs/" } 75 | from { "libs/" } 76 | from { project(":ik-analysis-core").buildDir.path + '/libs/' } 77 | from { project(":ik-analysis-es-plugin").buildDir.path + '/libs/' } 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /es-ik-sqlite3/libs/sqlite-jdbc-3.8.10.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacker330/es-ik/4fc353df3a1b1d891a5501a48c2e23a96e042383/es-ik-sqlite3/libs/sqlite-jdbc-3.8.10.1.jar -------------------------------------------------------------------------------- /es-ik-sqlite3/src/main/java/io/github/zacker330/es/ik/es/ik/analyzer/Sqlite3Configuration.java: -------------------------------------------------------------------------------- 1 | package io.github.zacker330.es.ik.es.ik.analyzer; 2 | 3 | import org.elasticsearch.common.logging.ESLogger; 4 | import org.elasticsearch.common.logging.ESLoggerFactory; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.Index; 8 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 9 | import org.elasticsearch.index.settings.IndexSettings; 10 | 11 | import java.sql.*; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class Sqlite3Configuration implements Configuration { 16 | 17 | private final ESLogger logger = ESLoggerFactory.getLogger(Sqlite3Configuration.class.getName()); 18 | 19 | private List<char[]> mainDictionary; 20 | private List<char[]> quantifierDictionary; 21 | private List<char[]> stopWordDictionary; 22 | 23 | 24 | private boolean smartMode = true; 25 | 26 | public Sqlite3Configuration() { 27 | } 28 | 29 | private Sqlite3Configuration(String dbPath) { 30 | if (dbPath == null || "".equals(dbPath.trim())) { 31 | logger.error("dbPath is required!"); 32 | throw new IllegalArgumentException(); 33 | } 34 | 35 | 36 | mainDictionary = new ArrayList<char[]>(); 37 | quantifierDictionary = new ArrayList<char[]>(); 38 | stopWordDictionary = new ArrayList<char[]>(); 39 | Connection connection = null; 40 | Statement statement = null; 41 | 42 | try { 43 | Class.forName("org.sqlite.JDBC"); 44 | connection = DriverManager.getConnection("jdbc:sqlite:" + dbPath); 45 | statement = connection.createStatement(); 46 | statement.setQueryTimeout(30); 47 | ResultSet mainResult = statement.executeQuery("select * from main_dictionary"); 48 | while (mainResult.next()) { 49 | String term = mainResult.getString("term"); 50 | if (term == null || "".equals(term.trim())) { 51 | continue; 52 | } 53 | mainDictionary.add(term.toCharArray()); 54 | } 55 | 56 | ResultSet stopWordResult = statement.executeQuery("select * from stopword_dictionary"); 57 | while (stopWordResult.next()) { 58 | String term = stopWordResult.getString("term"); 59 | if (term == null || "".equals(term.trim())) { 60 | continue; 61 | } 62 | stopWordDictionary.add(term.toCharArray()); 63 | } 64 | 65 | ResultSet quantifierResult = statement.executeQuery("select * from quantifier_dictionary"); 66 | while (quantifierResult.next()) { 67 |
String term = quantifierResult.getString("term"); 68 | if (term == null || "".equals(term.trim())) { 69 | continue; 70 | } 71 | quantifierDictionary.add(term.toCharArray()); 72 | } 73 | 74 | } catch (SQLException e) { 75 | logger.error("there's sql error", e); 76 | throw new RuntimeException(e); 77 | } catch (ClassNotFoundException e) { 78 | logger.error("not found sqlite3 jdbc", e); 79 | throw new RuntimeException(e); 80 | } finally { 81 | try { 82 | if (statement != null) { 83 | statement.close(); 84 | statement = null; 85 | } 86 | if (connection != null) { 87 | connection.close(); 88 | connection = null; 89 | } 90 | } catch (SQLException e) { 91 | logger.error("can't close jdbc connection", e); 92 | throw new RuntimeException(e); 93 | } 94 | } 95 | } 96 | 97 | public static Sqlite3Configuration smartModeSqlite3Configure(String dbPath) { 98 | Sqlite3Configuration sqlite3Configure = new Sqlite3Configuration(dbPath); 99 | sqlite3Configure.setSmartMode(true); 100 | return sqlite3Configure; 101 | } 102 | 103 | 104 | /** 105 | * Returns the useSmart flag. 106 | * When isSmartMode is true, the tokenizer uses the smart segmentation strategy; when false, it uses fine-grained segmentation. 107 | * 108 | * @return isSmartMode 109 | */ 110 | public boolean isSmartMode() { 111 | return smartMode; 112 | } 113 | 114 | /** 115 | * Sets the useSmart flag. 116 | * When isSmartMode is true, the tokenizer uses the smart segmentation strategy; when false, it uses fine-grained segmentation. 117 | * 118 | * @param smartMode 119 | */ 120 | public void setSmartMode(boolean smartMode) { 121 | this.smartMode = smartMode; 122 | } 123 | 124 | @Override 125 | public List<char[]> getMainDictionary() { 126 | return mainDictionary; 127 | } 128 | 129 | @Override 130 | public List<char[]> getStopWordDictionary() { 131 | return stopWordDictionary; 132 | } 133 | 134 | @Override 135 | public List<char[]> getQuantifierDictionary() { 136 | return quantifierDictionary; 137 | } 138 | 139 | 140 | @Override 141 | public Configuration init(Index index, @IndexSettings Settings indexSettings, Environment env, String name, Settings settings) { 142 | return Sqlite3Configuration.smartModeSqlite3Configure(env.settings().get("ik_analysis_db_path")); 143 | } 144 | } 145 | 146 | 147 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/main/resources/META-INF/services/org.elasticsearch.index.analysis.ik.spi.Configuration: -------------------------------------------------------------------------------- 1 | io.github.zacker330.es.ik.es.ik.analyzer.Sqlite3Configuration 2 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/io/github/zacker330/es/ik/AbstractIntegrationTest.java: -------------------------------------------------------------------------------- 1 | package io.github.zacker330.es.ik; 2 | 3 | import com.google.common.base.Function; 4 | import org.apache.commons.dbutils.QueryRunner; 5 | import org.junit.AfterClass; 6 | import org.junit.Assert; 7 | import org.junit.BeforeClass; 8 | import org.wltea.analyzer.IKAnalzyerTest; 9 | 10 | import java.io.*; 11 | import java.sql.SQLException; 12 | 13 | public abstract class AbstractIntegrationTest { 14 | 15 | public final static String dbPath = AbstractIntegrationTest.class.getResource(".") + "dictionary.db"; 16 | 17 | @BeforeClass 18 | public static void prepareDatabase() throws IOException { 19 | 20 | 21 | if (new File(dbPath).exists()) { 22 | Assert.assertTrue(new File(dbPath).delete()); 23 | } 24 | 25 | Assert.assertTrue(runSQL(dbPath, "CREATE TABLE IF NOT EXISTS main_dictionary(term TEXT NOT NULL,unique(term));")); 26 | Assert.assertTrue(runSQL(dbPath, "CREATE TABLE IF NOT EXISTS
stopword_dictionary(term TEXT NOT NULL,unique(term));")); 27 | Assert.assertTrue(runSQL(dbPath, "CREATE TABLE IF NOT EXISTS quantifier_dictionary(term TEXT NOT NULL,unique(term));")); 28 | // 29 | insertTerm("INSERT OR IGNORE INTO quantifier_dictionary values(?);", new IKAnalzyerTest().getClass().getClassLoader().getResourceAsStream("./quantifierDic.properties")); 30 | insertTerm("INSERT OR IGNORE INTO stopword_dictionary values(?);", new IKAnalzyerTest().getClass().getClassLoader().getResourceAsStream("./stopwordDic.properties")); 31 | insertTerm("INSERT OR IGNORE INTO main_dictionary values(?);", new IKAnalzyerTest().getClass().getClassLoader().getResourceAsStream("./mainDic.properties")); 32 | } 33 | 34 | @AfterClass 35 | public static void cleanDatabase() { 36 | if (new File(dbPath).exists()) { 37 | Assert.assertTrue(new File(dbPath).delete()); 38 | } 39 | } 40 | 41 | private static void insertTerm(String sql, InputStream dataLineByLineInputStream) throws IOException { 42 | readAndProcessTextInLine(dataLineByLineInputStream, new AbstractIntegrationTest.SQLRunFunction(dbPath, sql)); 43 | } 44 | 45 | private static void readAndProcessTextInLine(InputStream inputStream, Function<String, Boolean> function) throws IOException { 46 | BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"), 512); 47 | 48 | String line = null; 49 | do { 50 | line = bufferedReader.readLine(); 51 | if (line != null && !"".equals(line.trim())) { 52 | if (!function.apply(line.trim().toLowerCase())) { 53 | break; 54 | } 55 | } 56 | } while (line != null); 57 | } 58 | 59 | 60 | private static class SQLRunFunction implements Function<String, Boolean> { 61 | 62 | private String dbPath; 63 | private String sql; 64 | 65 | public SQLRunFunction(String dbPath, String sql) { 66 | this.dbPath = dbPath; 67 | this.sql = sql; 68 | } 69 | 70 | @Override 71 | public Boolean apply(String arg) { 72 | return runSQL(dbPath, sql, arg); 73 | } 74 | } 75 | 76 | private static boolean runSQL(String dbPath, String sql, Object...
args) { 77 | QueryRunner queryRunner = new QueryRunner(new DictionaryDataSource(dbPath)); 78 | try { 79 | System.out.println("SQL: " + sql); 80 | int result = queryRunner.update(sql, args); 81 | } catch (SQLException e) { 82 | System.out.println(e); 83 | return false; 84 | } 85 | return true; 86 | } 87 | 88 | 89 | } 90 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/io/github/zacker330/es/ik/DictionaryDatasource.java: -------------------------------------------------------------------------------- 1 | package io.github.zacker330.es.ik; 2 | 3 | import javax.sql.DataSource; 4 | import java.io.PrintWriter; 5 | import java.sql.Connection; 6 | import java.sql.DriverManager; 7 | import java.sql.SQLException; 8 | import java.sql.SQLFeatureNotSupportedException; 9 | import java.util.logging.Logger; 10 | 11 | public class DictionaryDataSource implements DataSource { 12 | 13 | private String dbPath; 14 | 15 | public DictionaryDataSource(String dbPath) { 16 | this.dbPath = dbPath; 17 | } 18 | 19 | @Override 20 | public Connection getConnection() throws SQLException { 21 | try { 22 | Class.forName("org.sqlite.JDBC"); 23 | } catch (ClassNotFoundException e) { 24 | System.out.println(e); 25 | } 26 | return DriverManager.getConnection("jdbc:sqlite:" + dbPath); 27 | } 28 | 29 | @Deprecated 30 | @Override 31 | public Connection getConnection(String username, String password) throws SQLException { 32 | return null; 33 | } 34 | 35 | @Deprecated 36 | @Override 37 | public PrintWriter getLogWriter() throws SQLException { 38 | return null; 39 | } 40 | 41 | @Deprecated 42 | 43 | @Override 44 | public void setLogWriter(PrintWriter out) throws SQLException { 45 | 46 | } 47 | 48 | @Deprecated 49 | 50 | @Override 51 | public void setLoginTimeout(int seconds) throws SQLException { 52 | 53 | } 54 | 55 | @Override 56 | public int getLoginTimeout() throws SQLException { 57 | return 100; 58 | } 59 | 60 | @Deprecated 61 | 62 | public Logger getParentLogger() throws SQLFeatureNotSupportedException { 63 | return null; 64 | } 65 | 66 | @Deprecated 67 | 68 | @Override 69 | public <T> T unwrap(Class<T> iface) throws SQLException { 70 | return null; 71 | } 72 | 73 | 74 | @Override 75 | public boolean isWrapperFor(Class<?> iface) throws SQLException { 76 | return false; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/org/wltea/analyzer/IKAnalzyerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package org.wltea.analyzer; 3 | 4 | import io.github.zacker330.es.ik.AbstractIntegrationTest; 5 | import io.github.zacker330.es.ik.es.ik.analyzer.Sqlite3Configuration; 6 | import org.apache.lucene.analysis.Analyzer; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 11 | import org.junit.Assert; 12 | import org.junit.Test; 13 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 14 | import org.wltea.analyzer.lucene.IKAnalyzer; 15 | 16 | import java.io.IOException; 17 | import java.io.StringReader; 18 | 19 | /** 20 | * Demonstrates tokenization with IKAnalyzer 21 | * 2012-10-22 22 | */ 23 | public class IKAnalzyerTest extends AbstractIntegrationTest { 24 | 25 | private DictionaryConfiguration configuration; 26 | 27 | @Test 28 | public void testAnalyzer() {
29 | // Build the IK tokenizer, using the smart segmentation mode 30 | 31 | configuration = Sqlite3Configuration.smartModeSqlite3Configure(dbPath); 32 | Analyzer analyzer = new IKAnalyzer(configuration); 33 | 34 | // Obtain Lucene's TokenStream object 35 | TokenStream tokenStream = null; 36 | try { 37 | tokenStream = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA <html>HELLO")); 38 | // ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too")); 39 | // Get the lexeme offset attribute 40 | OffsetAttribute offset = tokenStream.addAttribute(OffsetAttribute.class); 41 | // Get the lexeme text attribute 42 | CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class); 43 | // Get the lexeme type attribute 44 | TypeAttribute type = tokenStream.addAttribute(TypeAttribute.class); 45 | 46 | 47 | // Reset the TokenStream (resets the StringReader) 48 | tokenStream.reset(); 49 | 50 | tokenStream.incrementToken(); 51 | Assert.assertEquals(0, offset.startOffset()); 52 | Assert.assertEquals(5, offset.endOffset()); 53 | Assert.assertEquals("ENGLISH", type.type()); 54 | Assert.assertEquals("world", term.toString()); 55 | 56 | 57 | tokenStream.incrementToken(); 58 | Assert.assertEquals(10, offset.startOffset()); 59 | Assert.assertEquals(14, offset.endOffset()); 60 | Assert.assertEquals("ENGLISH", type.type()); 61 | Assert.assertEquals("html", term.toString()); 62 | 63 | 64 | tokenStream.incrementToken(); 65 | Assert.assertEquals(15, offset.startOffset()); 66 | Assert.assertEquals(19, offset.endOffset()); 67 | Assert.assertEquals("ENGLISH", type.type()); 68 | Assert.assertEquals("data", term.toString()); 69 | 70 | tokenStream.incrementToken(); 71 | Assert.assertEquals(21, offset.startOffset()); 72 | Assert.assertEquals(25, offset.endOffset()); 73 | Assert.assertEquals("ENGLISH", type.type()); 74 | Assert.assertEquals("html", term.toString()); 75 | 76 | tokenStream.incrementToken(); 77 | Assert.assertEquals(26, offset.startOffset()); 78 | Assert.assertEquals(31, offset.endOffset()); 79 | Assert.assertEquals("ENGLISH", type.type()); 80 | Assert.assertEquals("hello", term.toString()); 81 | 82 | 83 | // End the TokenStream (closes the StringReader) 84 | tokenStream.end(); 85 | 86 | } catch (IOException e) { 87 | e.printStackTrace(); 88 | } finally { 89 | // Release all of the TokenStream's resources 90 | if (tokenStream != null) { 91 | try { 92 | tokenStream.close(); 93 | } catch (IOException e) { 94 | e.printStackTrace(); 95 | } 96 | } 97 | } 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/java/org/wltea/analyzer/LuceneIndexAndSearchTest.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | import io.github.zacker330.es.ik.AbstractIntegrationTest; 4 | import io.github.zacker330.es.ik.es.ik.analyzer.Sqlite3Configuration; 5 | import org.apache.lucene.analysis.Analyzer; 6 | import org.apache.lucene.document.Document; 7 | import org.apache.lucene.document.Field; 8 | import org.apache.lucene.document.StringField; 9 | import org.apache.lucene.document.TextField; 10 | import org.apache.lucene.index.*; 11 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 12 | import org.apache.lucene.queryparser.classic.ParseException; 13 | import org.apache.lucene.queryparser.classic.QueryParser; 14 | import org.apache.lucene.search.IndexSearcher; 15 | import org.apache.lucene.search.Query; 16 | import org.apache.lucene.search.TopDocs; 17 | import org.apache.lucene.store.Directory; 18 | import org.apache.lucene.store.LockObtainFailedException; 19 | import org.apache.lucene.store.RAMDirectory;
20 | import org.apache.lucene.util.Version; 21 | import org.junit.Assert; 22 | import org.junit.Ignore; 23 | import org.junit.Test; 24 | import org.wltea.analyzer.lucene.IKAnalyzer; 25 | 26 | import java.io.IOException; 27 | 28 | public class LuceneIndexAndSearchTest extends AbstractIntegrationTest { 29 | 30 | 31 | @Test 32 | public void testLucenceIndex() { 33 | // Field name of the Lucene Document 34 | String fieldName = "text"; 35 | // Content to index and search 36 | String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; 37 | 38 | // Instantiate the IKAnalyzer tokenizer 39 | Analyzer analyzer = new IKAnalyzer(Sqlite3Configuration.smartModeSqlite3Configure(dbPath)); 40 | 41 | Directory directory = null; 42 | IndexWriter iwriter = null; 43 | IndexReader ireader = null; 44 | IndexSearcher isearcher = null; 45 | try { 46 | // Build an in-memory index 47 | directory = new RAMDirectory(); 48 | 49 | // Configure the IndexWriterConfig 50 | IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer); 51 | iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); 52 | iwriter = new IndexWriter(directory, iwConfig); 53 | // Write to the index 54 | Document doc = new Document(); 55 | doc.add(new StringField("ID", "10000", Field.Store.YES)); 56 | doc.add(new TextField(fieldName, text, Field.Store.YES)); 57 | iwriter.addDocument(doc); 58 | iwriter.close(); 59 | 60 | 61 | // Search phase ********************************** 62 | // Instantiate the searcher 63 | ireader = DirectoryReader.open(directory); 64 | isearcher = new IndexSearcher(ireader); 65 | 66 | String keyword = "中文分词工具包"; 67 | // Build the Query object with the QueryParser 68 | QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer); 69 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 70 | Query query = qp.parse(keyword); 71 | 72 | Assert.assertEquals(query.toString(), "+text:中文 +text:分词 +text:工具包"); 73 | 74 | // Retrieve the 5 highest-scoring records 75 | TopDocs topDocs = isearcher.search(query, 5); 76 | 77 | 78 | Assert.assertEquals(topDocs.totalHits, 1); 79 | Assert.assertEquals(isearcher.doc(topDocs.scoreDocs[0].doc).toString(), "Document<stored,indexed,omitNorms,indexOptions=DOCS_ONLY<ID:10000> stored,indexed,tokenized<text:IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。>>"); 80 | 81 | } catch (CorruptIndexException e) { 82 | e.printStackTrace(); 83 | } catch (LockObtainFailedException e) { 84 | e.printStackTrace(); 85 | } catch (IOException e) { 86 | e.printStackTrace(); 87 | } catch (ParseException e) { 88 | e.printStackTrace(); 89 | } finally { 90 | if (ireader != null) { 91 | try { 92 | ireader.close(); 93 | } catch (IOException e) { 94 | e.printStackTrace(); 95 | } 96 | } 97 | if (directory != null) { 98 | try { 99 | directory.close(); 100 | } catch (IOException e) { 101 | e.printStackTrace(); 102 | } 103 | } 104 | } 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/database.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE main_dictionary(term TEXT NOT NULL,unique(term)); 2 | 3 | CREATE TABLE stopword_dictionary(term TEXT NOT NULL,unique(term)); 4 | 5 | CREATE TABLE quantifier_dictionary(term TEXT NOT NULL,unique(term)); 6 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/logback-test.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <configuration> 3 | 4 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender"> 5 | <encoder> 6 | <pattern>%d{yyyy-MM-dd HH:mm:ss} [%thread] %-5level %logger{36} - %msg%n</pattern> 7 | </encoder> 8 | </appender> 9 | 10 | <root level="debug"> 11 | <appender-ref ref="STDOUT" /> 12 | </root> 13 | 14 | </configuration> 15 | --------------------------------------------------------------------------------
/es-ik-sqlite3/src/test/resources/mainDic.properties: -------------------------------------------------------------------------------- 1 | 这是 2 | 中文 3 | 分词 4 | 例子 5 | 结合 6 | 词典 7 | 文法 8 | 开源 9 | 工具包 10 | 使用 11 | 全新 12 | 迭代 13 | 最细 14 | 正向 15 | 粒度 16 | 切分 17 | 算法 18 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/quantifierDic.properties: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 划 43 | 列 44 | 则 45 | 刻 46 | 剂 47 | 剑 48 | 副 49 | 加仑 50 | 勺 51 | 包 52 | 匙 53 | 匹 54 | 区 55 | 千克 56 | 千米 57 | 升 58 | 卷 59 | 厅 60 | 厘 61 | 双 62 | 发 63 | 口 64 | 句 65 | 只 66 | 台 67 | 叶 68 | 号 69 | 名 70 | 吨 71 | 听 72 | 员 73 | 周 74 | 周年 75 | 品 76 | 回 77 | 团 78 | 圆 79 | 圈 80 | 地 81 | 场 82 | 块 83 | 坪 84 | 堆 85 | 声 86 | 壶 87 | 处 88 | 夜 89 | 大 90 | 天 91 | 头 92 | 套 93 | 女 94 | 孔 95 | 字 96 | 宗 97 | 室 98 | 家 99 | 寸 100 | 对 101 | 封 102 | 尊 103 | 小时 104 | 尺 105 | 尾 106 | 局 107 | 层 108 | 届 109 | 岁 110 | 师 111 | 帧 112 | 幅 113 | 幕 114 | 幢 115 | 平方 116 | 平方公尺 117 | 平方公里 118 | 平方分米 119 | 平方厘米 120 | 平方码 121 | 平方米 122 | 平方英寸 123 | 平方英尺 124 | 平方英里 125 | 平米 126 | 年 127 | 年代 128 | 年级 129 | 度 130 | 座 131 | 式 132 | 引 133 | 张 134 | 成 135 | 战 136 | 截 137 | 户 138 | 房 139 | 所 140 | 扇 141 | 手 142 | 打 143 | 批 144 | 把 145 | 折 146 | 担 147 | 拉 148 | 拍 149 | 招 150 | 拨 151 | 拳 152 | 指 153 | 掌 154 | 排 155 | 撮 156 | 支 157 | 文 158 | 斗 159 | 斤 160 | 方 161 | 族 162 | 日 163 | 时 164 | 曲 165 | 月 166 | 月份 167 | 期 168 | 本 169 | 朵 170 | 村 171 | 束 172 | 条 173 | 来 174 | 杯 175 | 枚 176 | 枝 177 | 枪 178 | 架 179 | 柄 180 | 柜 181 | 栋 182 | 栏 183 | 株 184 | 样 185 | 根 186 | 格 187 | 案 188 | 桌 189 | 档 190 | 桩 191 | 桶 192 | 梯 193 | 棵 194 | 楼 195 | 次 196 | 款 197 | 步 198 | 段 199 | 毛 200 | 毫 201 | 池 202 | 洲 203 | 派 204 | 海里 205 | 滴 206 | 炮 207 | 点 208 | 点钟 209 | 片 210 | 版 211 | 环 212 | 班 213 | 瓣 214 | 瓶 215 | 生 216 | 男 217 | 画 218 | 界 219 | 盆 220 | 盎司 221 | 盏 222 | 盒 223 | 盘 224 | 相 225 | 眼 226 | 石 227 | 码 228 | 碗 229 | 碟 230 | 磅 231 | 种 232 | 科 233 | 秒 234 | 秒钟 235 | 窝 236 | 立方公尺 237 | 立方分米 238 | 立方厘米 239 | 立方码 240 | 立方米 241 | 立方英寸 242 | 立方英尺 243 | 站 244 | 章 245 | 笔 246 | 等 247 | 筐 248 | 筒 249 | 箱 250 | 篇 251 | 篓 252 | 篮 253 | 簇 254 | 米 255 | 类 256 | 粒 257 | 级 258 | 组 259 | 维 260 | 缕 261 | 缸 262 | 罐 263 | 网 264 | 群 265 | 股 266 | 脚 267 | 船 268 | 艇 269 | 艘 270 | 色 271 | 节 272 | 英亩 273 | 英寸 274 | 英尺 275 | 英里 276 | 行 277 | 袋 278 | 角 279 | 言 280 | 课 281 | 起 282 | 趟 283 | 路 284 | 车 285 | 转 286 | 轮 287 | 辆 288 | 辈 289 | 连 290 | 通 291 | 遍 292 | 部 293 | 里 294 | 重 295 | 针 296 | 钟 297 | 钱 298 | 锅 299 | 门 300 | 间 301 | 队 302 | 阶段 303 | 隅 304 | 集 305 | 页 306 | 顶 307 | 顷 308 | 项 309 | 顿 310 | 颗 311 | 餐 312 | 首 313 | -------------------------------------------------------------------------------- /es-ik-sqlite3/src/test/resources/stopwordDic.properties: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with 34 | -------------------------------------------------------------------------------- 
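
The three term lists above feed the integration tests, and the same kind of data populates the dictionary.db described in the README. As a reference, here is a minimal standalone loader sketch that reuses the repository's own `INSERT OR IGNORE` statement and table names; the `DictionaryLoader` class and the file/db paths are hypothetical placeholders:

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.PreparedStatement;

    public class DictionaryLoader {
        public static void main(String[] args) throws Exception {
            Class.forName("org.sqlite.JDBC");
            Connection connection = DriverManager.getConnection("jdbc:sqlite:dictionary.db");
            PreparedStatement insert = connection.prepareStatement("INSERT OR IGNORE INTO main_dictionary values(?);");
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("mainDic.properties"), "UTF-8"));
            String line;
            while ((line = reader.readLine()) != null) {
                // one term per line, normalised the same way the tests normalise terms
                if (!"".equals(line.trim())) {
                    insert.setString(1, line.trim().toLowerCase());
                    insert.executeUpdate();
                }
            }
            reader.close();
            insert.close();
            connection.close();
        }
    }

The quantifier and stopword lists load the same way against quantifier_dictionary and stopword_dictionary.
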
/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zacker330/es-ik/4fc353df3a1b1d891a5501a48c2e23a96e042383/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Thu Jun 18 17:26:55 GMT+08:00 2015 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.1-all.zip 7 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # For Cygwin, ensure paths are in UNIX format before anything is touched. 46 | if $cygwin ; then 47 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 48 | fi 49 | 50 | # Attempt to set APP_HOME 51 | # Resolve links: $0 may be a link 52 | PRG="$0" 53 | # Need this for relative symlinks. 54 | while [ -h "$PRG" ] ; do 55 | ls=`ls -ld "$PRG"` 56 | link=`expr "$ls" : '.*-> \(.*\)$'` 57 | if expr "$link" : '/.*' > /dev/null; then 58 | PRG="$link" 59 | else 60 | PRG=`dirname "$PRG"`"/$link" 61 | fi 62 | done 63 | SAVED="`pwd`" 64 | cd "`dirname \"$PRG\"`/" >&- 65 | APP_HOME="`pwd -P`" 66 | cd "$SAVED" >&- 67 | 68 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 69 | 70 | # Determine the Java command to use to start the JVM. 71 | if [ -n "$JAVA_HOME" ] ; then 72 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 73 | # IBM's JDK on AIX uses strange locations for the executables 74 | JAVACMD="$JAVA_HOME/jre/sh/java" 75 | else 76 | JAVACMD="$JAVA_HOME/bin/java" 77 | fi 78 | if [ ! -x "$JAVACMD" ] ; then 79 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 80 | 81 | Please set the JAVA_HOME variable in your environment to match the 82 | location of your Java installation." 83 | fi 84 | else 85 | JAVACMD="java" 86 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 87 | 88 | Please set the JAVA_HOME variable in your environment to match the 89 | location of your Java installation." 90 | fi 91 | 92 | # Increase the maximum file descriptors if we can. 93 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 94 | MAX_FD_LIMIT=`ulimit -H -n` 95 | if [ $? 
-eq 0 ] ; then 96 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 97 | MAX_FD="$MAX_FD_LIMIT" 98 | fi 99 | ulimit -n $MAX_FD 100 | if [ $? -ne 0 ] ; then 101 | warn "Could not set maximum file descriptor limit: $MAX_FD" 102 | fi 103 | else 104 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 105 | fi 106 | fi 107 | 108 | # For Darwin, add options to specify how the application appears in the dock 109 | if $darwin; then 110 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 111 | fi 112 | 113 | # For Cygwin, switch paths to Windows format before running java 114 | if $cygwin ; then 115 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 116 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 158 | function splitJvmOpts() { 159 | JVM_OPTS=("$@") 160 | } 161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 163 | 164 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 165 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 
16 | set APP_BASE_NAME=%~n0
17 | set APP_HOME=%DIRNAME%
18 | 
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 | 
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 | 
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 | 
32 | goto fail
33 | 
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 | 
38 | if exist "%JAVA_EXE%" goto init
39 | 
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 | 
46 | goto fail
47 | 
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 | 
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 | if "%@eval[2+2]" == "4" goto 4NT_args
53 | 
54 | :win9xME_args
55 | @rem Slurp the command line arguments.
56 | set CMD_LINE_ARGS=
57 | set _SKIP=2
58 | 
59 | :win9xME_args_slurp
60 | if "x%~1" == "x" goto execute
61 | 
62 | set CMD_LINE_ARGS=%*
63 | goto execute
64 | 
65 | :4NT_args
66 | @rem Get arguments from the 4NT Shell from JP Software
67 | set CMD_LINE_ARGS=%$
68 | 
69 | :execute
70 | @rem Set up the command line
71 | 
72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
73 | 
74 | @rem Execute Gradle
75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
76 | 
77 | :end
78 | @rem End local scope for the variables with windows NT shell
79 | if "%ERRORLEVEL%"=="0" goto mainEnd
80 | 
81 | :fail
82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
83 | rem the _cmd.exe /c_ return code!
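rem (Any non-empty value enables this, e.g. "set GRADLE_EXIT_CONSOLE=yes".)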
84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /ik-analysis-core/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | 3 | group = "io.github.zacker330.es" 4 | archivesBaseName = "ik-analysis-core" 5 | version = "1.0.0" 6 | 7 | repositories { 8 | mavenCentral() 9 | } 10 | 11 | dependencies { 12 | runtime('ch.qos.logback:logback-classic:1.1.3') 13 | testCompile('junit:junit:4.12') 14 | } 15 | 16 | 17 | modifyPom { 18 | project { 19 | name 'es-ik' 20 | description 'Kind of Chinese Analysis for Elasticsearch' 21 | url 'https://github.com/zacker330/es-ik' 22 | inceptionYear '2015' 23 | 24 | scm { 25 | url 'https://github.com/zacker330/es-ik' 26 | connection 'scm:https://github.com/zacker330/es-ik.git' 27 | developerConnection 'scm:git@github.com:zacker330/es-ik.git' 28 | } 29 | 30 | licenses { 31 | license { 32 | name 'The Apache Software License, Version 2.0' 33 | url 'http://www.apache.org/licenses/LICENSE-2.0.txt' 34 | distribution 'repo' 35 | } 36 | } 37 | 38 | developers { 39 | developer { 40 | id 'zacker330' 41 | name 'Jack' 42 | email 'zacker330@gmail.com' 43 | } 44 | } 45 | } 46 | 47 | } 48 | 49 | javadoc { 50 | source = sourceSets.main.allJava 51 | classpath = configurations.compile 52 | } 53 | 54 | extraArchive { 55 | sources = true 56 | tests = true 57 | javadoc = true 58 | } 59 | 60 | -------------------------------------------------------------------------------- /ik-analysis-core/config/checkstyle/checkstyle.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/configuration/DictionaryConfiguration.java: -------------------------------------------------------------------------------- 1 | 2 | package org.wltea.analyzer.configuration; 3 | 4 | import java.util.List; 5 | 6 | public interface DictionaryConfiguration { 7 | 8 | 9 | 10 | public boolean isSmartMode(); 11 | 12 | public void setSmartMode(boolean useSmart); 13 | 14 | List getMainDictionary(); 15 | 16 | List getStopWordDictionary(); 17 | 18 | List getQuantifierDictionary(); 19 | } 20 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.LinkedList; 8 | import java.util.Map; 9 | import java.util.Set; 10 | 11 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 12 | import org.wltea.analyzer.dic.Dictionary; 13 | 14 | class AnalyzeContext { 15 | 16 | //默认缓冲区大小 17 | private static final int BUFF_SIZE = 3072; 18 | //缓冲区耗尽的临界值 19 | private static final int BUFF_EXHAUST_CRITICAL = 48; 20 | 21 | 22 | //字符窜读取缓冲 23 | private char[] segmentBuff; 24 | //字符类型数组 25 | private int[] charTypes; 26 | 27 | 28 | //记录Reader内已分析的字串总长度 29 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 30 | private int buffOffset; 31 | //当前缓冲区位置指针 32 | private int cursor; 33 | //最近一次读入的,可处理的字串长度 34 | private int available; 35 | 36 | 37 | 
//子分词器锁 38 | //该集合非空,说明有子分词器在占用segmentBuff 39 | private Set buffLocker; 40 | 41 | //原始分词结果集合,未经歧义处理 42 | private QuickSortSet orgLexemes; 43 | //LexemePath位置索引表 44 | private Map pathMap; 45 | //最终分词结果集 46 | private LinkedList results; 47 | 48 | //分词器配置项 49 | private DictionaryConfiguration cfg; 50 | 51 | public AnalyzeContext(DictionaryConfiguration cfg){ 52 | this.cfg = cfg; 53 | this.segmentBuff = new char[BUFF_SIZE]; 54 | this.charTypes = new int[BUFF_SIZE]; 55 | this.buffLocker = new HashSet(); 56 | this.orgLexemes = new QuickSortSet(); 57 | this.pathMap = new HashMap(); 58 | this.results = new LinkedList(); 59 | } 60 | 61 | int getCursor(){ 62 | return this.cursor; 63 | } 64 | 65 | char[] getSegmentBuff(){ 66 | return this.segmentBuff; 67 | } 68 | 69 | char getCurrentChar(){ 70 | return this.segmentBuff[this.cursor]; 71 | } 72 | 73 | int getCurrentCharType(){ 74 | return this.charTypes[this.cursor]; 75 | } 76 | 77 | int getBufferOffset(){ 78 | return this.buffOffset; 79 | } 80 | 81 | int fillBuffer(Reader reader) throws IOException{ 82 | int readCount = 0; 83 | if(this.buffOffset == 0){ 84 | //首次读取reader 85 | readCount = reader.read(segmentBuff); 86 | }else{ 87 | int offset = this.available - this.cursor; 88 | if(offset > 0){ 89 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 90 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset); 91 | readCount = offset; 92 | } 93 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 94 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset); 95 | } 96 | //记录最后一次从Reader中读入的可用字符长度 97 | this.available = readCount; 98 | //重置当前指针 99 | this.cursor = 0; 100 | return readCount; 101 | } 102 | 103 | void initCursor(){ 104 | this.cursor = 0; 105 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 106 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 107 | } 108 | 109 | boolean moveCursor(){ 110 | if(this.cursor < this.available - 1){ 111 | this.cursor++; 112 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 113 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 114 | return true; 115 | }else{ 116 | return false; 117 | } 118 | } 119 | 120 | void lockBuffer(String segmenterName){ 121 | this.buffLocker.add(segmenterName); 122 | } 123 | 124 | void unlockBuffer(String segmenterName){ 125 | this.buffLocker.remove(segmenterName); 126 | } 127 | 128 | boolean isBufferLocked(){ 129 | return this.buffLocker.size() > 0; 130 | } 131 | 132 | boolean isBufferConsumed(){ 133 | return this.cursor == this.available - 1; 134 | } 135 | 136 | boolean needRefillBuffer(){ 137 | return this.available == BUFF_SIZE 138 | && this.cursor < this.available - 1 139 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL 140 | && !this.isBufferLocked(); 141 | } 142 | 143 | void markBufferOffset(){ 144 | this.buffOffset += this.cursor; 145 | } 146 | 147 | void addLexeme(Lexeme lexeme){ 148 | this.orgLexemes.addLexeme(lexeme); 149 | } 150 | 151 | void addLexemePath(LexemePath path){ 152 | if(path != null){ 153 | this.pathMap.put(path.getPathBegin(), path); 154 | } 155 | } 156 | 157 | 158 | QuickSortSet getOrgLexemes(){ 159 | return this.orgLexemes; 160 | } 161 | 162 | void processUnkownCJKChar(){ 163 | int index = 0; 164 | for( ; index < this.available ;){ 165 | //跳过标点符号等字符 166 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){ 167 | index++; 168 | continue; 
169 | } 170 | //从pathMap找出对应index位置的LexemePath 171 | LexemePath path = this.pathMap.get(index); 172 | if(path != null){ 173 | //输出LexemePath中的lexeme到results集合 174 | Lexeme l = path.pollFirst(); 175 | while(l != null){ 176 | this.results.add(l); 177 | //将index移至lexeme后 178 | index = l.getBegin() + l.getLength(); 179 | l = path.pollFirst(); 180 | if(l != null){ 181 | //输出path内部,词元间遗漏的单字 182 | for(;index < l.getBegin();index++){ 183 | this.outputSingleCJK(index); 184 | } 185 | } 186 | } 187 | }else{//pathMap中找不到index对应的LexemePath 188 | //单字输出 189 | this.outputSingleCJK(index); 190 | index++; 191 | } 192 | } 193 | //清空当前的Map 194 | this.pathMap.clear(); 195 | } 196 | 197 | private void outputSingleCJK(int index){ 198 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){ 199 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR); 200 | this.results.add(singleCharLexeme); 201 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){ 202 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK); 203 | this.results.add(singleCharLexeme); 204 | } 205 | } 206 | 207 | boolean hasNextResult(){ 208 | return !this.results.isEmpty(); 209 | } 210 | 211 | Lexeme getNextLexeme(){ 212 | //从结果集取出,并移除第一个Lexme 213 | Lexeme result = this.results.pollFirst(); 214 | while(result != null){ 215 | //数量词合并 216 | this.compound(result); 217 | if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){ 218 | //是停止词继续取列表的下一个 219 | result = this.results.pollFirst(); 220 | }else{ 221 | //不是停止词, 生成lexeme的词元文本,输出 222 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength())); 223 | break; 224 | } 225 | } 226 | return result; 227 | } 228 | 229 | void reset(){ 230 | this.buffLocker.clear(); 231 | this.orgLexemes = new QuickSortSet(); 232 | this.available =0; 233 | this.buffOffset = 0; 234 | this.charTypes = new int[BUFF_SIZE]; 235 | this.cursor = 0; 236 | this.results.clear(); 237 | this.segmentBuff = new char[BUFF_SIZE]; 238 | this.pathMap.clear(); 239 | } 240 | 241 | private void compound(Lexeme result){ 242 | if(!this.cfg.isSmartMode()){ 243 | return ; 244 | } 245 | //数量词合并处理 246 | if(!this.results.isEmpty()){ 247 | 248 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){ 249 | Lexeme nextLexeme = this.results.peekFirst(); 250 | boolean appendOk = false; 251 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){ 252 | //合并英文数词+中文数词 253 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); 254 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 255 | //合并英文数词+中文量词 256 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 257 | } 258 | if(appendOk){ 259 | //弹出 260 | this.results.pollFirst(); 261 | } 262 | } 263 | 264 | //可能存在第二轮合并 265 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){ 266 | Lexeme nextLexeme = this.results.peekFirst(); 267 | boolean appendOk = false; 268 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){ 269 | //合并中文数词+中文量词 270 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 271 | } 272 | if(appendOk){ 273 | //弹出 274 | this.results.pollFirst(); 275 | } 276 | } 277 | 278 | } 279 | } 280 | 281 | } 282 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import 
java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import org.wltea.analyzer.dic.Dictionary; 7 | import org.wltea.analyzer.dic.Hit; 8 | 9 | 10 | class CJKSegmenter implements ISegmenter { 11 | 12 | //子分词器标签 13 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 14 | //待处理的分词hit队列 15 | private List tmpHits; 16 | 17 | 18 | CJKSegmenter(){ 19 | this.tmpHits = new LinkedList(); 20 | } 21 | 22 | /* (non-Javadoc) 23 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 24 | */ 25 | public void analyze(AnalyzeContext context) { 26 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 27 | 28 | //优先处理tmpHits中的hit 29 | if(!this.tmpHits.isEmpty()){ 30 | //处理词段队列 31 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 32 | for(Hit hit : tmpArray){ 33 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 34 | if(hit.isMatch()){ 35 | //输出当前的词 36 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 37 | context.addLexeme(newLexeme); 38 | 39 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 40 | this.tmpHits.remove(hit); 41 | } 42 | 43 | }else if(hit.isUnmatch()){ 44 | //hit不是词,移除 45 | this.tmpHits.remove(hit); 46 | } 47 | } 48 | } 49 | 50 | //********************************* 51 | //再对当前指针位置的字符进行单字匹配 52 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); 53 | if(singleCharHit.isMatch()){//首字成词 54 | //输出当前的词 55 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 56 | context.addLexeme(newLexeme); 57 | 58 | //同时也是词前缀 59 | if(singleCharHit.isPrefix()){ 60 | //前缀匹配则放入hit列表 61 | this.tmpHits.add(singleCharHit); 62 | } 63 | }else if(singleCharHit.isPrefix()){//首字为词前缀 64 | //前缀匹配则放入hit列表 65 | this.tmpHits.add(singleCharHit); 66 | } 67 | 68 | 69 | }else{ 70 | //遇到CHAR_USELESS字符 71 | //清空队列 72 | this.tmpHits.clear(); 73 | } 74 | 75 | //判断缓冲区是否已经读完 76 | if(context.isBufferConsumed()){ 77 | //清空队列 78 | this.tmpHits.clear(); 79 | } 80 | 81 | //判断是否锁定缓冲区 82 | if(this.tmpHits.size() == 0){ 83 | context.unlockBuffer(SEGMENTER_NAME); 84 | 85 | }else{ 86 | context.lockBuffer(SEGMENTER_NAME); 87 | } 88 | } 89 | 90 | /* (non-Javadoc) 91 | * @see org.wltea.analyzer.core.ISegmenter#reset() 92 | */ 93 | public void reset() { 94 | //清空队列 95 | this.tmpHits.clear(); 96 | } 97 | 98 | } 99 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import java.util.HashSet; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Set; 7 | 8 | import org.wltea.analyzer.dic.Dictionary; 9 | import org.wltea.analyzer.dic.Hit; 10 | 11 | class CN_QuantifierSegmenter implements ISegmenter{ 12 | 13 | //子分词器标签 14 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 15 | 16 | //中文数词 17 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum 18 | private static Set ChnNumberChars = new HashSet(); 19 | static{ 20 | char[] ca = Chn_Num.toCharArray(); 21 | for(char nChar : ca){ 22 | ChnNumberChars.add(nChar); 23 | } 24 | } 25 | 26 | /* 27 | * 词元的开始位置, 28 | * 同时作为子分词器状态标识 29 | * 当start > -1 时,标识当前的分词器正在处理字符 30 | */ 31 | private int nStart; 32 | /* 
33 | * 记录词元结束位置 34 | * end记录的是在词元中最后一个出现的合理的数词结束 35 | */ 36 | private int nEnd; 37 | 38 | //待处理的量词hit队列 39 | private List countHits; 40 | 41 | 42 | CN_QuantifierSegmenter(){ 43 | nStart = -1; 44 | nEnd = -1; 45 | this.countHits = new LinkedList(); 46 | } 47 | 48 | public void analyze(AnalyzeContext context) { 49 | //处理中文数词 50 | this.processCNumber(context); 51 | //处理中文量词 52 | this.processCount(context); 53 | 54 | //判断是否锁定缓冲区 55 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){ 56 | //对缓冲区解锁 57 | context.unlockBuffer(SEGMENTER_NAME); 58 | }else{ 59 | context.lockBuffer(SEGMENTER_NAME); 60 | } 61 | } 62 | 63 | 64 | public void reset() { 65 | nStart = -1; 66 | nEnd = -1; 67 | countHits.clear(); 68 | } 69 | 70 | private void processCNumber(AnalyzeContext context){ 71 | if(nStart == -1 && nEnd == -1){//初始状态 72 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 73 | && ChnNumberChars.contains(context.getCurrentChar())){ 74 | //记录数词的起始、结束位置 75 | nStart = context.getCursor(); 76 | nEnd = context.getCursor(); 77 | } 78 | }else{//正在处理状态 79 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 80 | && ChnNumberChars.contains(context.getCurrentChar())){ 81 | //记录数词的结束位置 82 | nEnd = context.getCursor(); 83 | }else{ 84 | //输出数词 85 | this.outputNumLexeme(context); 86 | //重置头尾指针 87 | nStart = -1; 88 | nEnd = -1; 89 | } 90 | } 91 | 92 | //缓冲区已经用完,还有尚未输出的数词 93 | if(context.isBufferConsumed()){ 94 | if(nStart != -1 && nEnd != -1){ 95 | //输出数词 96 | outputNumLexeme(context); 97 | //重置头尾指针 98 | nStart = -1; 99 | nEnd = -1; 100 | } 101 | } 102 | } 103 | 104 | private void processCount(AnalyzeContext context){ 105 | // 判断是否需要启动量词扫描 106 | if(!this.needCountScan(context)){ 107 | return; 108 | } 109 | 110 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){ 111 | 112 | //优先处理countHits中的hit 113 | if(!this.countHits.isEmpty()){ 114 | //处理词段队列 115 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); 116 | for(Hit hit : tmpArray){ 117 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 118 | if(hit.isMatch()){ 119 | //输出当前的词 120 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); 121 | context.addLexeme(newLexeme); 122 | 123 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 124 | this.countHits.remove(hit); 125 | } 126 | 127 | }else if(hit.isUnmatch()){ 128 | //hit不是词,移除 129 | this.countHits.remove(hit); 130 | } 131 | } 132 | } 133 | 134 | //********************************* 135 | //对当前指针位置的字符进行单字匹配 136 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1); 137 | if(singleCharHit.isMatch()){//首字成量词词 138 | //输出当前的词 139 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); 140 | context.addLexeme(newLexeme); 141 | 142 | //同时也是词前缀 143 | if(singleCharHit.isPrefix()){ 144 | //前缀匹配则放入hit列表 145 | this.countHits.add(singleCharHit); 146 | } 147 | }else if(singleCharHit.isPrefix()){//首字为量词前缀 148 | //前缀匹配则放入hit列表 149 | this.countHits.add(singleCharHit); 150 | } 151 | 152 | 153 | }else{ 154 | //输入的不是中文字符 155 | //清空未成形的量词 156 | this.countHits.clear(); 157 | } 158 | 159 | //缓冲区数据已经读完,还有尚未输出的量词 160 | if(context.isBufferConsumed()){ 161 | //清空未成形的量词 162 | this.countHits.clear(); 163 | } 164 | } 165 | 166 | private boolean needCountScan(AnalyzeContext context){ 167 | if((nStart != -1 && nEnd != -1 ) || 
!countHits.isEmpty()){ 168 | //正在处理中文数词,或者正在处理量词 169 | return true; 170 | }else{ 171 | //找到一个相邻的数词 172 | if(!context.getOrgLexemes().isEmpty()){ 173 | Lexeme l = context.getOrgLexemes().peekLast(); 174 | if(Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()){ 175 | if(l.getBegin() + l.getLength() == context.getCursor()){ 176 | return true; 177 | } 178 | } 179 | } 180 | } 181 | return false; 182 | } 183 | 184 | private void outputNumLexeme(AnalyzeContext context){ 185 | if(nStart > -1 && nEnd > -1){ 186 | //输出数词 187 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM); 188 | context.addLexeme(newLexeme); 189 | 190 | } 191 | } 192 | 193 | } 194 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | class CharacterUtil { 4 | 5 | public static final int CHAR_USELESS = 0; 6 | 7 | public static final int CHAR_ARABIC = 0X00000001; 8 | 9 | public static final int CHAR_ENGLISH = 0X00000002; 10 | 11 | public static final int CHAR_CHINESE = 0X00000004; 12 | 13 | public static final int CHAR_OTHER_CJK = 0X00000008; 14 | 15 | 16 | static int identifyCharType(char input){ 17 | if(input >= '0' && input <= '9'){ 18 | return CHAR_ARABIC; 19 | 20 | }else if((input >= 'a' && input <= 'z') 21 | || (input >= 'A' && input <= 'Z')){ 22 | return CHAR_ENGLISH; 23 | 24 | }else { 25 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 26 | 27 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 28 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 29 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 30 | //目前已知的中文字符UTF-8集合 31 | return CHAR_CHINESE; 32 | 33 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 34 | //韩文字符集 35 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 36 | || ub == Character.UnicodeBlock.HANGUL_JAMO 37 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 38 | //日文字符集 39 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 40 | || ub == Character.UnicodeBlock.KATAKANA //片假名 41 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 42 | return CHAR_OTHER_CJK; 43 | 44 | } 45 | } 46 | //其他的不做处理的字符 47 | return CHAR_USELESS; 48 | } 49 | 50 | /** 51 | * 进行字符规格化(全角转半角,大写转小写处理) 52 | * @param input 53 | * @return char 54 | */ 55 | static char regularize(char input){ 56 | if (input == 12288) { 57 | input = (char) 32; 58 | 59 | }else if (input > 65280 && input < 65375) { 60 | input = (char) (input - 65248); 61 | 62 | }else if (input >= 'A' && input <= 'Z') { 63 | input += 32; 64 | } 65 | 66 | return input; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 
8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | void process(AnalyzeContext context , boolean useSmart){ 40 | QuickSortSet orgLexemes = context.getOrgLexemes(); 41 | Lexeme orgLexeme = orgLexemes.pollFirst(); 42 | 43 | LexemePath crossPath = new LexemePath(); 44 | while(orgLexeme != null){ 45 | if(!crossPath.addCrossLexeme(orgLexeme)){ 46 | //找到与crossPath不相交的下一个crossPath 47 | if(crossPath.size() == 1 || !useSmart){ 48 | //crossPath没有歧义 或者 不做歧义处理 49 | //直接输出当前crossPath 50 | context.addLexemePath(crossPath); 51 | }else{ 52 | //对当前的crossPath进行歧义处理 53 | QuickSortSet.Cell headCell = crossPath.getHead(); 54 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 55 | //输出歧义处理结果judgeResult 56 | context.addLexemePath(judgeResult); 57 | } 58 | 59 | //把orgLexeme加入新的crossPath中 60 | crossPath = new LexemePath(); 61 | crossPath.addCrossLexeme(orgLexeme); 62 | } 63 | orgLexeme = orgLexemes.pollFirst(); 64 | } 65 | 66 | 67 | //处理最后的path 68 | if(crossPath.size() == 1 || !useSmart){ 69 | //crossPath没有歧义 或者 不做歧义处理 70 | //直接输出当前crossPath 71 | context.addLexemePath(crossPath); 72 | }else{ 73 | //对当前的crossPath进行歧义处理 74 | QuickSortSet.Cell headCell = crossPath.getHead(); 75 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 76 | //输出歧义处理结果judgeResult 77 | context.addLexemePath(judgeResult); 78 | } 79 | } 80 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 81 | //候选路径集合 82 | TreeSet pathOptions = new TreeSet(); 83 | //候选结果路径 84 | LexemePath option = new LexemePath(); 85 | 86 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 87 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 88 | 89 | //当前词元链并非最理想的,加入候选路径集合 90 | pathOptions.add(option.copy()); 91 | 92 | //存在歧义词,处理 93 | QuickSortSet.Cell c = null; 94 | while(!lexemeStack.isEmpty()){ 95 | c = lexemeStack.pop(); 96 | //回滚词元链 97 | this.backPath(c.getLexeme() , option); 98 | //从歧义词位置开始,递归,生成可选方案 99 | this.forwardPath(c , option); 100 | pathOptions.add(option.copy()); 101 | } 102 | 103 | //返回集合中的最优方案 104 | return pathOptions.first(); 105 | 106 | } 107 | 108 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 109 | //发生冲突的Lexeme栈 110 | Stack conflictStack = new Stack(); 111 | QuickSortSet.Cell c = lexemeCell; 112 | //迭代遍历Lexeme链表 113 | while(c != null && c.getLexeme() != null){ 114 | if(!option.addNotCrossLexeme(c.getLexeme())){ 115 | //词元交叉,添加失败则加入lexemeStack栈 116 | conflictStack.push(c); 117 | } 118 | c = c.getNext(); 119 | } 120 | return conflictStack; 121 | } 122 | 123 | private void backPath(Lexeme l , LexemePath option){ 124 | 
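// Roll back the candidate path: while the conflicting lexeme l still
// crosses the tail of the path, keep removing tail lexemes so that l can
// be re-added from a non-crossing state by the next forwardPath pass.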
while(option.checkCross(l)){ 125 | option.removeTail(); 126 | } 127 | 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 27 | import org.wltea.analyzer.dic.Dictionary; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | import java.util.ArrayList; 32 | import java.util.List; 33 | 34 | /** 35 | * IK分词器主类 36 | */ 37 | public final class IKSegmenter { 38 | 39 | //字符窜reader 40 | private Reader input; 41 | //分词器配置项 42 | private DictionaryConfiguration cfg; 43 | //分词器上下文 44 | private AnalyzeContext context; 45 | //分词处理器列表 46 | private List segmenters; 47 | //分词歧义裁决器 48 | private IKArbitrator arbitrator; 49 | 50 | 51 | public IKSegmenter(Reader input, DictionaryConfiguration cfg) { 52 | this.input = input; 53 | this.cfg = cfg; 54 | 55 | //初始化词典单例 56 | Dictionary.initial(this.cfg); 57 | //初始化分词上下文 58 | this.context = new AnalyzeContext(this.cfg); 59 | //加载子分词器 60 | this.segmenters = this.loadSegmenters(); 61 | //加载歧义裁决器 62 | this.arbitrator = new IKArbitrator(); 63 | } 64 | 65 | 66 | private List loadSegmenters() { 67 | List segmenters = new ArrayList(4); 68 | //处理字母的子分词器 69 | segmenters.add(new LetterSegmenter()); 70 | //处理中文数量词的子分词器 71 | segmenters.add(new CN_QuantifierSegmenter()); 72 | //处理中文词的子分词器 73 | segmenters.add(new CJKSegmenter()); 74 | return segmenters; 75 | } 76 | 77 | public synchronized Lexeme next() throws IOException { 78 | if (this.context.hasNextResult()) { 79 | //存在尚未输出的分词结果 80 | return this.context.getNextLexeme(); 81 | } else { 82 | /* 83 | * 从reader中读取数据,填充buffer 84 | * 如果reader是分次读入buffer的,那么buffer要进行移位处理 85 | * 移位处理上次读入的但未处理的数据 86 | */ 87 | int available = context.fillBuffer(this.input); 88 | if (available <= 0) { 89 | //reader已经读完 90 | context.reset(); 91 | return null; 92 | 93 | } else { 94 | //初始化指针 95 | context.initCursor(); 96 | do { 97 | //遍历子分词器 98 | for (ISegmenter segmenter : segmenters) { 99 | segmenter.analyze(context); 100 | } 101 | //字符缓冲区接近读完,需要读入新的字符 102 | if (context.needRefillBuffer()) { 103 | break; 104 | } 105 | //向前移动指针 106 | } while (context.moveCursor()); 107 | //重置子分词器,为下轮循环进行初始化 108 | for (ISegmenter segmenter : segmenters) { 109 | segmenter.reset(); 110 | } 111 | } 112 | 
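// At this point the raw, possibly overlapping lexemes emitted by the
// sub-segmenters are buffered in context.orgLexemes; the arbitrator below
// resolves the overlaps and stores the winning LexemePath per position.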
//对分词进行歧义处理 113 | this.arbitrator.process(context, this.cfg.isSmartMode()); 114 | //处理未切分CJK字符 115 | context.processUnkownCJKChar(); 116 | //记录本次分词的缓冲区位移 117 | context.markBufferOffset(); 118 | //输出词元 119 | if (this.context.hasNextResult()) { 120 | return this.context.getNextLexeme(); 121 | } 122 | return null; 123 | } 124 | } 125 | 126 | public synchronized void reset(Reader input) { 127 | this.input = input; 128 | context.reset(); 129 | for (ISegmenter segmenter : segmenters) { 130 | segmenter.reset(); 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * 30 | * 子分词器接口 31 | */ 32 | interface ISegmenter { 33 | 34 | void analyze(AnalyzeContext context); 35 | 36 | 37 | /** 38 | * 重置子分析器状态 39 | */ 40 | void reset(); 41 | 42 | } 43 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | import java.util.Arrays; 4 | 5 | class LetterSegmenter implements ISegmenter { 6 | 7 | //子分词器标签 8 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; 9 | //链接符号 10 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' 
, '@' , '_'}; 11 | 12 | //数字符号 13 | private static final char[] Num_Connector = new char[]{',' , '.'}; 14 | 15 | /* 16 | * 词元的开始位置, 17 | * 同时作为子分词器状态标识 18 | * 当start > -1 时,标识当前的分词器正在处理字符 19 | */ 20 | private int start; 21 | /* 22 | * 记录词元结束位置 23 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 24 | */ 25 | private int end; 26 | 27 | /* 28 | * 字母起始位置 29 | */ 30 | private int englishStart; 31 | 32 | /* 33 | * 字母结束位置 34 | */ 35 | private int englishEnd; 36 | 37 | /* 38 | * 阿拉伯数字起始位置 39 | */ 40 | private int arabicStart; 41 | 42 | /* 43 | * 阿拉伯数字结束位置 44 | */ 45 | private int arabicEnd; 46 | 47 | LetterSegmenter(){ 48 | Arrays.sort(Letter_Connector); 49 | Arrays.sort(Num_Connector); 50 | this.start = -1; 51 | this.end = -1; 52 | this.englishStart = -1; 53 | this.englishEnd = -1; 54 | this.arabicStart = -1; 55 | this.arabicEnd = -1; 56 | } 57 | 58 | 59 | /* (non-Javadoc) 60 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 61 | */ 62 | public void analyze(AnalyzeContext context) { 63 | boolean bufferLockFlag = false; 64 | //处理英文字母 65 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 66 | //处理阿拉伯字母 67 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 68 | //处理混合字母 69 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 70 | 71 | //判断是否锁定缓冲区 72 | if(bufferLockFlag){ 73 | context.lockBuffer(SEGMENTER_NAME); 74 | }else{ 75 | //对缓冲区解锁 76 | context.unlockBuffer(SEGMENTER_NAME); 77 | } 78 | } 79 | 80 | /* (non-Javadoc) 81 | * @see org.wltea.analyzer.core.ISegmenter#reset() 82 | */ 83 | public void reset() { 84 | this.start = -1; 85 | this.end = -1; 86 | this.englishStart = -1; 87 | this.englishEnd = -1; 88 | this.arabicStart = -1; 89 | this.arabicEnd = -1; 90 | } 91 | 92 | private boolean processMixLetter(AnalyzeContext context){ 93 | boolean needLock = false; 94 | 95 | if(this.start == -1){//当前的分词器尚未开始处理字符 96 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 97 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 98 | //记录起始指针的位置,标明分词器进入处理状态 99 | this.start = context.getCursor(); 100 | this.end = start; 101 | } 102 | 103 | }else{//当前的分词器正在处理字符 104 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 105 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 106 | //记录下可能的结束位置 107 | this.end = context.getCursor(); 108 | 109 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 110 | && this.isLetterConnector(context.getCurrentChar())){ 111 | //记录下可能的结束位置 112 | this.end = context.getCursor(); 113 | }else{ 114 | //遇到非Letter字符,输出词元 115 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 116 | context.addLexeme(newLexeme); 117 | this.start = -1; 118 | this.end = -1; 119 | } 120 | } 121 | 122 | //判断缓冲区是否已经读完 123 | if(context.isBufferConsumed()){ 124 | if(this.start != -1 && this.end != -1){ 125 | //缓冲以读完,输出词元 126 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 127 | context.addLexeme(newLexeme); 128 | this.start = -1; 129 | this.end = -1; 130 | } 131 | } 132 | 133 | //判断是否锁定缓冲区 134 | if(this.start == -1 && this.end == -1){ 135 | //对缓冲区解锁 136 | needLock = false; 137 | }else{ 138 | needLock = true; 139 | } 140 | return needLock; 141 | } 142 | 143 | private boolean processEnglishLetter(AnalyzeContext context){ 144 | boolean needLock = false; 145 | 146 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符 147 
| if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 148 | //记录起始指针的位置,标明分词器进入处理状态 149 | this.englishStart = context.getCursor(); 150 | this.englishEnd = this.englishStart; 151 | } 152 | }else {//当前的分词器正在处理英文字符 153 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 154 | //记录当前指针位置为结束位置 155 | this.englishEnd = context.getCursor(); 156 | }else{ 157 | //遇到非English字符,输出词元 158 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 159 | context.addLexeme(newLexeme); 160 | this.englishStart = -1; 161 | this.englishEnd= -1; 162 | } 163 | } 164 | 165 | //判断缓冲区是否已经读完 166 | if(context.isBufferConsumed()){ 167 | if(this.englishStart != -1 && this.englishEnd != -1){ 168 | //缓冲以读完,输出词元 169 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 170 | context.addLexeme(newLexeme); 171 | this.englishStart = -1; 172 | this.englishEnd= -1; 173 | } 174 | } 175 | 176 | //判断是否锁定缓冲区 177 | if(this.englishStart == -1 && this.englishEnd == -1){ 178 | //对缓冲区解锁 179 | needLock = false; 180 | }else{ 181 | needLock = true; 182 | } 183 | return needLock; 184 | } 185 | 186 | private boolean processArabicLetter(AnalyzeContext context){ 187 | boolean needLock = false; 188 | 189 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符 190 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 191 | //记录起始指针的位置,标明分词器进入处理状态 192 | this.arabicStart = context.getCursor(); 193 | this.arabicEnd = this.arabicStart; 194 | } 195 | }else {//当前的分词器正在处理数字字符 196 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 197 | //记录当前指针位置为结束位置 198 | this.arabicEnd = context.getCursor(); 199 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 200 | && this.isNumConnector(context.getCurrentChar())){ 201 | //不输出数字,但不标记结束 202 | }else{ 203 | ////遇到非Arabic字符,输出词元 204 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 205 | context.addLexeme(newLexeme); 206 | this.arabicStart = -1; 207 | this.arabicEnd = -1; 208 | } 209 | } 210 | 211 | //判断缓冲区是否已经读完 212 | if(context.isBufferConsumed()){ 213 | if(this.arabicStart != -1 && this.arabicEnd != -1){ 214 | //生成已切分的词元 215 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 216 | context.addLexeme(newLexeme); 217 | this.arabicStart = -1; 218 | this.arabicEnd = -1; 219 | } 220 | } 221 | 222 | //判断是否锁定缓冲区 223 | if(this.arabicStart == -1 && this.arabicEnd == -1){ 224 | //对缓冲区解锁 225 | needLock = false; 226 | }else{ 227 | needLock = true; 228 | } 229 | return needLock; 230 | } 231 | 232 | private boolean isLetterConnector(char input){ 233 | int index = Arrays.binarySearch(Letter_Connector, input); 234 | return index >= 0; 235 | } 236 | 237 | private boolean isNumConnector(char input){ 238 | int index = Arrays.binarySearch(Num_Connector, input); 239 | return index >= 0; 240 | } 241 | } 242 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | public class Lexeme implements Comparable { 4 | //lexemeType常量 5 | //未知 6 | public static final int TYPE_UNKNOWN = 0; 7 | //英文 8 | public static final 
int TYPE_ENGLISH = 1;
9 | //数字
10 | public static final int TYPE_ARABIC = 2;
11 | //英文数字混合
12 | public static final int TYPE_LETTER = 3;
13 | //中文词元
14 | public static final int TYPE_CNWORD = 4;
15 | //中文单字
16 | public static final int TYPE_CNCHAR = 64;
17 | //日韩文字
18 | public static final int TYPE_OTHER_CJK = 8;
19 | //中文数词
20 | public static final int TYPE_CNUM = 16;
21 | //中文量词
22 | public static final int TYPE_COUNT = 32;
23 | //中文数量词
24 | public static final int TYPE_CQUAN = 48;
25 | 
26 | //词元的起始位移
27 | private int offset;
28 | //词元的相对起始位置
29 | private int begin;
30 | //词元的长度
31 | private int length;
32 | //词元文本
33 | private String lexemeText;
34 | //词元类型
35 | private int lexemeType;
36 | 
37 | 
38 | public Lexeme(int offset, int begin, int length, int lexemeType) {
39 | this.offset = offset;
40 | this.begin = begin;
41 | if (length < 0) {
42 | throw new IllegalArgumentException("length < 0");
43 | }
44 | this.length = length;
45 | this.lexemeType = lexemeType;
46 | }
47 | 
48 | /*
49 | * 判断词元相等算法
50 | * 起始位置偏移、起始位置、终止位置相同
51 | * @see java.lang.Object#equals(Object o)
52 | */
53 | public boolean equals(Object o) {
54 | if (o == null) {
55 | return false;
56 | }
57 | 
58 | if (this == o) {
59 | return true;
60 | }
61 | 
62 | if (o instanceof Lexeme) {
63 | Lexeme other = (Lexeme) o;
64 | if (this.offset == other.getOffset()
65 | && this.begin == other.getBegin()
66 | && this.length == other.getLength()) {
67 | return true;
68 | } else {
69 | return false;
70 | }
71 | } else {
72 | return false;
73 | }
74 | }
75 | 
76 | /*
77 | * 词元哈希编码算法
78 | * @see java.lang.Object#hashCode()
79 | */
80 | public int hashCode() {
81 | int absBegin = getBeginPosition();
82 | int absEnd = getEndPosition();
83 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
84 | }
85 | 
86 | /*
87 | * 词元在排序集合中的比较算法
88 | * @see java.lang.Comparable#compareTo(java.lang.Object)
89 | */
90 | public int compareTo(Lexeme other) {
91 | //起始位置优先
92 | if (this.begin < other.getBegin()) {
93 | return -1;
94 | } else if (this.begin == other.getBegin()) {
95 | //词元长度优先
96 | if (this.length > other.getLength()) {
97 | return -1;
98 | } else if (this.length == other.getLength()) {
99 | return 0;
100 | } else {//this.length < other.getLength()
101 | return 1;
102 | }
103 | 
104 | } else {//this.begin > other.getBegin()
105 | return 1;
106 | }
107 | }
108 | 
109 | public int getOffset() {
110 | return offset;
111 | }
112 | 
113 | public void setOffset(int offset) {
114 | this.offset = offset;
115 | }
116 | 
117 | public int getBegin() {
118 | return begin;
119 | }
120 | 
121 | public int getBeginPosition() {
122 | return offset + begin;
123 | }
124 | 
125 | public void setBegin(int begin) {
126 | this.begin = begin;
127 | }
128 | 
129 | public int getEndPosition() {
130 | return offset + begin + length;
131 | }
132 | 
133 | public int getLength() {
134 | return this.length;
135 | }
136 | 
137 | public void setLength(int length) {
138 | if (length < 0) {
139 | throw new IllegalArgumentException("length < 0");
140 | }
141 | this.length = length;
142 | }
143 | 
144 | public String getLexemeText() {
145 | if (lexemeText == null) {
146 | return "";
147 | }
148 | return lexemeText;
149 | }
150 | 
151 | public void setLexemeText(String lexemeText) {
152 | if (lexemeText == null) {
153 | this.lexemeText = "";
154 | this.length = 0;
155 | } else {
156 | this.lexemeText = lexemeText;
157 | this.length = lexemeText.length();
158 | }
159 | }
160 | 
161 | public int getLexemeType() {
162 | return lexemeType;
163 | }
164 | 
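/**
 * Returns a human-readable label for the lexeme type; used by toString()
 * when dumping segmentation results.
 */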
165 | public String getLexemeTypeString() {
166 | switch (lexemeType) {
167 | 
168 | case TYPE_ENGLISH:
169 | return "ENGLISH";
170 | 
171 | case TYPE_ARABIC:
172 | return "ARABIC";
173 | 
174 | case TYPE_LETTER:
175 | return "LETTER";
176 | 
177 | case TYPE_CNWORD:
178 | return "CN_WORD";
179 | 
180 | case TYPE_CNCHAR:
181 | return "CN_CHAR";
182 | 
183 | case TYPE_OTHER_CJK:
184 | return "OTHER_CJK";
185 | 
186 | case TYPE_COUNT:
187 | return "COUNT";
188 | 
189 | case TYPE_CNUM:
190 | return "TYPE_CNUM";
191 | 
192 | case TYPE_CQUAN:
193 | return "TYPE_CQUAN";
194 | 
195 | default:
196 | return "UNKNOWN";
197 | }
198 | }
199 | 
200 | 
201 | public void setLexemeType(int lexemeType) {
202 | this.lexemeType = lexemeType;
203 | }
204 | 
205 | public boolean append(Lexeme l, int lexemeType) {
206 | if (l != null && this.getEndPosition() == l.getBeginPosition()) {
207 | this.length += l.getLength();
208 | this.lexemeType = lexemeType;
209 | return true;
210 | } else {
211 | return false;
212 | }
213 | }
214 | 
215 | 
216 | public String toString() {
217 | StringBuffer strbuf = new StringBuffer();
218 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
219 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
220 | strbuf.append(this.getLexemeTypeString());
221 | return strbuf.toString();
222 | }
223 | 
224 | 
225 | }
226 | 
--------------------------------------------------------------------------------
/ik-analysis-core/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.core;
2 | 
3 | 
4 | class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
5 | 
6 | //起始位置
7 | private int pathBegin;
8 | //结束
9 | private int pathEnd;
10 | //词元链的有效字符长度
11 | private int payloadLength;
12 | 
13 | LexemePath() {
14 | this.pathBegin = -1;
15 | this.pathEnd = -1;
16 | this.payloadLength = 0;
17 | }
18 | 
19 | boolean addCrossLexeme(Lexeme lexeme) {
20 | if (this.isEmpty()) {
21 | this.addLexeme(lexeme);
22 | this.pathBegin = lexeme.getBegin();
23 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
24 | this.payloadLength += lexeme.getLength();
25 | return true;
26 | 
27 | } else if (this.checkCross(lexeme)) {
28 | this.addLexeme(lexeme);
29 | if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
30 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
31 | }
32 | this.payloadLength = this.pathEnd - this.pathBegin;
33 | return true;
34 | 
35 | } else {
36 | return false;
37 | 
38 | }
39 | }
40 | 
41 | boolean addNotCrossLexeme(Lexeme lexeme) {
42 | if (this.isEmpty()) {
43 | this.addLexeme(lexeme);
44 | this.pathBegin = lexeme.getBegin();
45 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
46 | this.payloadLength += lexeme.getLength();
47 | return true;
48 | 
49 | } else if (this.checkCross(lexeme)) {
50 | return false;
51 | 
52 | } else {
53 | this.addLexeme(lexeme);
54 | this.payloadLength += lexeme.getLength();
55 | Lexeme head = this.peekFirst();
56 | this.pathBegin = head.getBegin();
57 | Lexeme tail = this.peekLast();
58 | this.pathEnd = tail.getBegin() + tail.getLength();
59 | return true;
60 | 
61 | }
62 | }
63 | 
64 | Lexeme removeTail() {
65 | Lexeme tail = this.pollLast();
66 | if (this.isEmpty()) {
67 | this.pathBegin = -1;
68 | this.pathEnd = -1;
69 | this.payloadLength = 0;
70 | } else {
71 | this.payloadLength -= tail.getLength();
72 | Lexeme newTail = this.peekLast();
73 | this.pathEnd = newTail.getBegin() + newTail.getLength();
74 | }
75 | return 
tail; 76 | } 77 | 78 | boolean checkCross(Lexeme lexeme) { 79 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 80 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin() + lexeme.getLength()); 81 | } 82 | 83 | int getPathBegin() { 84 | return pathBegin; 85 | } 86 | 87 | int getPathEnd() { 88 | return pathEnd; 89 | } 90 | 91 | int getPayloadLength() { 92 | return this.payloadLength; 93 | } 94 | 95 | int getPathLength() { 96 | return this.pathEnd - this.pathBegin; 97 | } 98 | 99 | 100 | int getXWeight() { 101 | int product = 1; 102 | Cell c = this.getHead(); 103 | while (c != null && c.getLexeme() != null) { 104 | product *= c.getLexeme().getLength(); 105 | c = c.getNext(); 106 | } 107 | return product; 108 | } 109 | 110 | int getPWeight() { 111 | int pWeight = 0; 112 | int p = 0; 113 | Cell c = this.getHead(); 114 | while (c != null && c.getLexeme() != null) { 115 | p++; 116 | pWeight += p * c.getLexeme().getLength(); 117 | c = c.getNext(); 118 | } 119 | return pWeight; 120 | } 121 | 122 | LexemePath copy() { 123 | LexemePath theCopy = new LexemePath(); 124 | theCopy.pathBegin = this.pathBegin; 125 | theCopy.pathEnd = this.pathEnd; 126 | theCopy.payloadLength = this.payloadLength; 127 | Cell c = this.getHead(); 128 | while (c != null && c.getLexeme() != null) { 129 | theCopy.addLexeme(c.getLexeme()); 130 | c = c.getNext(); 131 | } 132 | return theCopy; 133 | } 134 | 135 | public int compareTo(LexemePath o) { 136 | //比较有效文本长度 137 | if (this.payloadLength > o.payloadLength) { 138 | return -1; 139 | } else if (this.payloadLength < o.payloadLength) { 140 | return 1; 141 | } else { 142 | //比较词元个数,越少越好 143 | if (this.size() < o.size()) { 144 | return -1; 145 | } else if (this.size() > o.size()) { 146 | return 1; 147 | } else { 148 | //路径跨度越大越好 149 | if (this.getPathLength() > o.getPathLength()) { 150 | return -1; 151 | } else if (this.getPathLength() < o.getPathLength()) { 152 | return 1; 153 | } else { 154 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 155 | if (this.pathEnd > o.pathEnd) { 156 | return -1; 157 | } else if (pathEnd < o.pathEnd) { 158 | return 1; 159 | } else { 160 | //词长越平均越好 161 | if (this.getXWeight() > o.getXWeight()) { 162 | return -1; 163 | } else if (this.getXWeight() < o.getXWeight()) { 164 | return 1; 165 | } else { 166 | //词元位置权重比较 167 | if (this.getPWeight() > o.getPWeight()) { 168 | return -1; 169 | } else if (this.getPWeight() < o.getPWeight()) { 170 | return 1; 171 | } 172 | 173 | } 174 | } 175 | } 176 | } 177 | } 178 | return 0; 179 | } 180 | 181 | public String toString() { 182 | StringBuffer sb = new StringBuffer(); 183 | sb.append("pathBegin : ").append(pathBegin).append("\r\n"); 184 | sb.append("pathEnd : ").append(pathEnd).append("\r\n"); 185 | sb.append("payloadLength : ").append(payloadLength).append("\r\n"); 186 | Cell head = this.getHead(); 187 | while (head != null) { 188 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n"); 189 | head = head.getNext(); 190 | } 191 | return sb.toString(); 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/core/QuickSortSet.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.core; 2 | 3 | class QuickSortSet { 4 | //链表头 5 | private Cell head; 6 | //链表尾 7 | private Cell tail; 8 | //链表的实际大小 9 | private int size; 10 | 11 | QuickSortSet(){ 12 | this.size = 0; 13 | } 14 | 15 | boolean 
addLexeme(Lexeme lexeme){ 16 | Cell newCell = new Cell(lexeme); 17 | if(this.size == 0){ 18 | this.head = newCell; 19 | this.tail = newCell; 20 | this.size++; 21 | return true; 22 | 23 | }else{ 24 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合 25 | return false; 26 | 27 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部 28 | this.tail.next = newCell; 29 | newCell.prev = this.tail; 30 | this.tail = newCell; 31 | this.size++; 32 | return true; 33 | 34 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部 35 | this.head.prev = newCell; 36 | newCell.next = this.head; 37 | this.head = newCell; 38 | this.size++; 39 | return true; 40 | 41 | }else{ 42 | //从尾部上逆 43 | Cell index = this.tail; 44 | while(index != null && index.compareTo(newCell) > 0){ 45 | index = index.prev; 46 | } 47 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 48 | return false; 49 | 50 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 51 | newCell.prev = index; 52 | newCell.next = index.next; 53 | index.next.prev = newCell; 54 | index.next = newCell; 55 | this.size++; 56 | return true; 57 | } 58 | } 59 | } 60 | return false; 61 | } 62 | 63 | Lexeme peekFirst(){ 64 | if(this.head != null){ 65 | return this.head.lexeme; 66 | } 67 | return null; 68 | } 69 | 70 | Lexeme pollFirst(){ 71 | if(this.size == 1){ 72 | Lexeme first = this.head.lexeme; 73 | this.head = null; 74 | this.tail = null; 75 | this.size--; 76 | return first; 77 | }else if(this.size > 1){ 78 | Lexeme first = this.head.lexeme; 79 | this.head = this.head.next; 80 | this.size --; 81 | return first; 82 | }else{ 83 | return null; 84 | } 85 | } 86 | 87 | Lexeme peekLast(){ 88 | if(this.tail != null){ 89 | return this.tail.lexeme; 90 | } 91 | return null; 92 | } 93 | 94 | Lexeme pollLast(){ 95 | if(this.size == 1){ 96 | Lexeme last = this.head.lexeme; 97 | this.head = null; 98 | this.tail = null; 99 | this.size--; 100 | return last; 101 | 102 | }else if(this.size > 1){ 103 | Lexeme last = this.tail.lexeme; 104 | this.tail = this.tail.prev; 105 | this.size--; 106 | return last; 107 | 108 | }else{ 109 | return null; 110 | } 111 | } 112 | 113 | int size(){ 114 | return this.size; 115 | } 116 | 117 | boolean isEmpty(){ 118 | return this.size == 0; 119 | } 120 | 121 | Cell getHead(){ 122 | return this.head; 123 | } 124 | 125 | class Cell implements Comparable{ 126 | private Cell prev; 127 | private Cell next; 128 | private Lexeme lexeme; 129 | 130 | Cell(Lexeme lexeme){ 131 | if(lexeme == null){ 132 | throw new IllegalArgumentException("lexeme must not be null"); 133 | } 134 | this.lexeme = lexeme; 135 | } 136 | 137 | public int compareTo(Cell o) { 138 | return this.lexeme.compareTo(o.lexeme); 139 | } 140 | 141 | public Cell getPrev(){ 142 | return this.prev; 143 | } 144 | 145 | public Cell getNext(){ 146 | return this.next; 147 | } 148 | 149 | public Lexeme getLexeme(){ 150 | return this.lexeme; 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/dic/DictSegment.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.dic; 2 | 3 | import java.util.Arrays; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | class DictSegment implements Comparable{ 8 | 9 | //公用字典表,存储汉字 10 | private static final Map charMap = new HashMap(16 , 0.95f); 11 | //数组大小上限 12 | private static final int ARRAY_LENGTH_LIMIT = 3; 13 | 14 | 15 | //Map存储结构 16 | private Map childrenMap; 17 | 
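// Note: children are kept in a small sorted array first and are migrated to
// a Map once the fan-out exceeds ARRAY_LENGTH_LIMIT; see lookforSegment().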
//数组方式存储结构 18 | private DictSegment[] childrenArray; 19 | 20 | 21 | //当前节点上存储的字符 22 | private Character nodeChar; 23 | //当前节点存储的Segment数目 24 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 25 | private int storeSize = 0; 26 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 27 | private int nodeState = 0; 28 | 29 | 30 | DictSegment(Character nodeChar){ 31 | if(nodeChar == null){ 32 | throw new IllegalArgumentException("参数为空异常,字符不能为空"); 33 | } 34 | this.nodeChar = nodeChar; 35 | } 36 | 37 | Character getNodeChar() { 38 | return nodeChar; 39 | } 40 | 41 | /* 42 | * 判断是否有下一个节点 43 | */ 44 | boolean hasNextNode(){ 45 | return this.storeSize > 0; 46 | } 47 | 48 | Hit match(char[] charArray){ 49 | return this.match(charArray , 0 , charArray.length , null); 50 | } 51 | 52 | Hit match(char[] charArray , int begin , int length){ 53 | return this.match(charArray , begin , length , null); 54 | } 55 | 56 | Hit match(char[] charArray , int begin , int length , Hit searchHit){ 57 | 58 | if(searchHit == null){ 59 | //如果hit为空,新建 60 | searchHit= new Hit(); 61 | //设置hit的其实文本位置 62 | searchHit.setBegin(begin); 63 | }else{ 64 | //否则要将HIT状态重置 65 | searchHit.setUnmatch(); 66 | } 67 | //设置hit的当前处理位置 68 | searchHit.setEnd(begin); 69 | 70 | Character keyChar = new Character(charArray[begin]); 71 | DictSegment ds = null; 72 | 73 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题 74 | DictSegment[] segmentArray = this.childrenArray; 75 | Map segmentMap = this.childrenMap; 76 | 77 | //STEP1 在节点中查找keyChar对应的DictSegment 78 | if(segmentArray != null){ 79 | //在数组中查找 80 | DictSegment keySegment = new DictSegment(keyChar); 81 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment); 82 | if(position >= 0){ 83 | ds = segmentArray[position]; 84 | } 85 | 86 | }else if(segmentMap != null){ 87 | //在map中查找 88 | ds = segmentMap.get(keyChar); 89 | } 90 | 91 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 92 | if(ds != null){ 93 | if(length > 1){ 94 | //词未匹配完,继续往下搜索 95 | return ds.match(charArray, begin + 1 , length - 1 , searchHit); 96 | }else if (length == 1){ 97 | 98 | //搜索最后一个char 99 | if(ds.nodeState == 1){ 100 | //添加HIT状态为完全匹配 101 | searchHit.setMatch(); 102 | } 103 | if(ds.hasNextNode()){ 104 | //添加HIT状态为前缀匹配 105 | searchHit.setPrefix(); 106 | //记录当前位置的DictSegment 107 | searchHit.setMatchedDictSegment(ds); 108 | } 109 | return searchHit; 110 | } 111 | 112 | } 113 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 114 | return searchHit; 115 | } 116 | 117 | void fillSegment(char[] charArray){ 118 | this.fillSegment(charArray, 0 , charArray.length , 1); 119 | } 120 | 121 | void disableSegment(char[] charArray){ 122 | this.fillSegment(charArray, 0 , charArray.length , 0); 123 | } 124 | 125 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){ 126 | //获取字典表中的汉字对象 127 | Character beginChar = new Character(charArray[begin]); 128 | Character keyChar = charMap.get(beginChar); 129 | //字典中没有该字,则将其添加入字典 130 | if(keyChar == null){ 131 | charMap.put(beginChar, beginChar); 132 | keyChar = beginChar; 133 | } 134 | 135 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 136 | DictSegment ds = lookforSegment(keyChar , enabled); 137 | if(ds != null){ 138 | //JSONUtils 139 | if(length > 1){ 140 | //词元还没有完全加入词典树 141 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled); 142 | }else if (length == 1){ 143 | //已经是词元的最后一个char,设置当前节点状态为enabled, 144 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 145 | ds.nodeState = enabled; 146 | } 147 | } 148 | 149 | } 150 | 151 | private DictSegment 
lookforSegment(Character keyChar , int create){ 152 | 153 | DictSegment ds = null; 154 | 155 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){ 156 | //get the array container, creating it if it does not exist yet 157 | DictSegment[] segmentArray = getChildrenArray(); 158 | //search the array 159 | DictSegment keySegment = new DictSegment(keyChar); 160 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment); 161 | if(position >= 0){ 162 | ds = segmentArray[position]; 163 | } 164 | 165 | //no matching segment was found in the array 166 | if(ds == null && create == 1){ 167 | ds = keySegment; 168 | if(this.storeSize < ARRAY_LENGTH_LIMIT){ 169 | //the array still has room; keep using array storage 170 | segmentArray[this.storeSize] = ds; 171 | //segment count +1 172 | this.storeSize++; 173 | Arrays.sort(segmentArray , 0 , this.storeSize); 174 | 175 | }else{ 176 | //the array is full; switch to Map storage 177 | //get the Map container, creating it if it does not exist yet 178 | Map segmentMap = getChildrenMap(); 179 | //migrate the segments from the array into the Map 180 | migrate(segmentArray , segmentMap); 181 | //store the new segment 182 | segmentMap.put(keyChar, ds); 183 | //segment count +1; storeSize++ must run before the array reference is released, so that in the extreme case a reader never sees an empty array 184 | this.storeSize++; 185 | //release the current array reference 186 | this.childrenArray = null; 187 | } 188 | 189 | } 190 | 191 | }else{ 192 | //get the Map container, creating it if it does not exist yet 193 | Map segmentMap = getChildrenMap(); 194 | //search the Map 195 | ds = segmentMap.get(keyChar); 196 | if(ds == null && create == 1){ 197 | //construct a new segment 198 | ds = new DictSegment(keyChar); 199 | segmentMap.put(keyChar , ds); 200 | //segment count on this node +1 201 | this.storeSize ++; 202 | } 203 | } 204 | 205 | return ds; 206 | } 207 | 208 | 209 | private DictSegment[] getChildrenArray(){ 210 | if(this.childrenArray == null){ 211 | synchronized(this){ 212 | if(this.childrenArray == null){ 213 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT]; 214 | } 215 | } 216 | } 217 | return this.childrenArray; 218 | } 219 | 220 | private Map getChildrenMap(){ 221 | if(this.childrenMap == null){ 222 | synchronized(this){ 223 | if(this.childrenMap == null){ 224 | this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); 225 | } 226 | } 227 | } 228 | return this.childrenMap; 229 | } 230 | 231 | private void migrate(DictSegment[] segmentArray , Map segmentMap){ 232 | for(DictSegment segment : segmentArray){ 233 | if(segment != null){ 234 | segmentMap.put(segment.nodeChar, segment); 235 | } 236 | } 237 | } 238 | 239 | public int compareTo(DictSegment o) { 240 | //compare the chars stored on the two nodes 241 | return this.nodeChar.compareTo(o.nodeChar); 242 | } 243 | 244 | } 245 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK Chinese Word Segmenter, version 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com) 21 | * Copyright 2012, Oolong Studio 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.util.Collection; 29 | 30 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 31 | 32 | /** 33 | * Dictionary manager class, singleton pattern. 34 | */ 35 | public class Dictionary { 36 | 37 | private static Dictionary singleton; 38 | 39 | private DictSegment _MainDict; 40 | 41 | /* 42 | * Stop word dictionary 43 | */ 44 | private DictSegment _StopWordDict; 45 | /* 46 | * Quantifier dictionary 47 | */ 48 | private DictSegment _QuantifierDict; 49 | 50 | /** 51 | * Configuration object 52 | */ 53 | private DictionaryConfiguration cfg; 54 | 55 | private Dictionary(DictionaryConfiguration cfg){ 56 | this.cfg = cfg; 57 | this.loadMainDict(); 58 | this.loadStopWordDict(); 59 | this.loadQuantifierDict(); 60 | } 61 | 62 | public static Dictionary initial(DictionaryConfiguration cfg){ 63 | if(singleton == null){ 64 | synchronized(Dictionary.class){ 65 | if(singleton == null){ 66 | singleton = new Dictionary(cfg); 67 | return singleton; 68 | } 69 | } 70 | } 71 | return singleton; 72 | } 73 | 74 | public static Dictionary getSingleton(){ 75 | if(singleton == null){ 76 | throw new IllegalStateException("Dictionary has not been initialized yet; call the initial method first"); 77 | } 78 | return singleton; 79 | } 80 | 81 | public void addWords(Collection<String> words){ 82 | if(words != null){ 83 | for(String word : words){ 84 | if (word != null) { 85 | //batch-load the entries into the in-memory main dictionary 86 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); 87 | } 88 | } 89 | } 90 | } 91 | 92 | public void disableWords(Collection<String> words){ 93 | if(words != null){ 94 | for(String word : words){ 95 | if (word != null) { 96 | //batch-mask the entries in the main dictionary 97 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); 98 | } 99 | } 100 | } 101 | } 102 | 103 | public Hit matchInMainDict(char[] charArray){ 104 | return singleton._MainDict.match(charArray); 105 | } 106 | 107 | public Hit matchInMainDict(char[] charArray , int begin, int length){ 108 | return singleton._MainDict.match(charArray, begin, length); 109 | } 110 | 111 | public Hit matchInQuantifierDict(char[] charArray , int begin, int length){ 112 | return singleton._QuantifierDict.match(charArray, begin, length); 113 | } 114 | 115 | 116 | public Hit matchWithHit(char[] charArray , int currentIndex , Hit matchedHit){ 117 | DictSegment ds = matchedHit.getMatchedDictSegment(); 118 | return ds.match(charArray, currentIndex, 1 , matchedHit); 119 | } 120 | 121 | 122 | public boolean isStopWord(char[] charArray , int begin, int length){ 123 | return singleton._StopWordDict.match(charArray, begin, length).isMatch(); 124 | } 125 | 126 | /** 127 | * Loads the main dictionary and the extension dictionaries. 128 | */ 129 | private void loadMainDict() { 130 | //create the main dictionary instance 131 | _MainDict = new DictSegment((char) 0); 132 | for (char[] segment : cfg.getMainDictionary()) { 133 | _MainDict.fillSegment(segment); 134 | 135 | } 136 | } 137 | 138 | /** 139 | * Loads the user-extended stop word dictionary. 140 | */ 141 | private void loadStopWordDict(){ 142 | //create the stop word dictionary instance 143 | _StopWordDict = new DictSegment((char)0); 144 | for (char[] segment : cfg.getStopWordDictionary()) { 145 | _StopWordDict.fillSegment(segment); 146 | } 147 | 148 | } 149 | 150 | /** 151 | * Loads the quantifier dictionary. 152 | */ 153 | private void loadQuantifierDict(){ 154 | //create the quantifier dictionary instance 155 | _QuantifierDict = new DictSegment((char)0); 156 | for (char[] segment : cfg.getQuantifierDictionary()) { 157 | 
_QuantifierDict.fillSegment(segment); 158 | } 159 | 160 | } 161 | 162 | } 163 | -------------------------------------------------------------------------------- /ik-analysis-core/src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK Chinese Word Segmenter, version 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com) 22 | * Copyright 2012, Oolong Studio 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /* 29 | * Represents one hit from a dictionary match. 30 | */ 31 | public class Hit { 32 | //the hit did not match 33 | private static final int UNMATCH = 0x00000000; 34 | //the hit is a full match 35 | private static final int MATCH = 0x00000001; 36 | //the hit is a prefix match 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //current state of this hit, unmatched by default 41 | private int hitState = UNMATCH; 42 | 43 | //the dictionary branch node reached so far during matching 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * start position of the matched segment 47 | */ 48 | private int begin; 49 | /* 50 | * end position of the matched segment 51 | */ 52 | private int end; 53 | 54 | 55 | /* 56 | * Checks whether this hit is a full match. 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /* 62 | * Marks this hit as a full match. 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /* 69 | * Checks whether this hit is a prefix of a dictionary word. 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /* 75 | * Marks this hit as a prefix match. 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /* 81 | * Checks whether this hit is unmatched. 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /* 87 | * Resets this hit to the unmatched state. 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /ik-analysis-core/src/test/java/org/wltea/analyzer/IKSegmenterTest.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | import org.wltea.analyzer.core.IKSegmenter; 6 | import org.wltea.analyzer.core.Lexeme; 7 | 8 | import 
java.io.Reader; 9 | import java.io.StringReader; 10 | 11 | public class IKSegmenterTest { 12 | 13 | @Test 14 | public void testSegment() throws Exception { 15 | Reader in = new StringReader("一一分 准确值就是它们听上去的那样。干柴诸如日期或用户ID。当然字符串也可以是准确值,如用户名或邮件地址。准确值Foo与准确值foo是不同的。准确值2014和准确值2014-09-15也是不同的。测试"); 16 | 17 | IKSegmenter segmenter = new IKSegmenter(in, MockDictionary.smartModeSqlite3Configure()); 18 | 19 | assertSegmenterCorrect(segmenter.next(), "一一分", 0, 3, 3, "CN_WORD"); 20 | assertSegmenterCorrect(segmenter.next(), "准确值", 4, 7, 3, "CN_WORD"); 21 | assertSegmenterCorrect(segmenter.next(), "听", 11, 12, 1, "CN_WORD"); 22 | assertSegmenterCorrect(segmenter.next(), "上去", 12, 14, 2, "CN_WORD"); 23 | assertSegmenterCorrect(segmenter.next(), "干柴", 18, 20, 2, "CN_WORD"); 24 | assertSegmenterCorrect(segmenter.next(), "诸如", 20, 22, 2, "CN_WORD"); 25 | assertSegmenterCorrect(segmenter.next(), "日期", 22, 24, 2, "CN_WORD"); 26 | assertSegmenterCorrect(segmenter.next(), "用户", 25, 27, 2, "CN_WORD"); 27 | assertSegmenterCorrect(segmenter.next(), "id", 27, 29, 2, "ENGLISH"); 28 | assertSegmenterCorrect(segmenter.next(), "当然", 30, 32, 2, "CN_WORD"); 29 | assertSegmenterCorrect(segmenter.next(), "字符串", 32, 35, 3, "CN_WORD"); 30 | assertSegmenterCorrect(segmenter.next(), "以是", 37, 39, 2, "CN_WORD"); 31 | assertSegmenterCorrect(segmenter.next(), "准确值", 39, 42, 3, "CN_WORD"); 32 | assertSegmenterCorrect(segmenter.next(), "用户名", 44, 47, 3, "CN_WORD"); 33 | assertSegmenterCorrect(segmenter.next(), "邮件地址", 48, 52, 4, "CN_WORD"); 34 | assertSegmenterCorrect(segmenter.next(), "准确值", 53, 56, 3, "CN_WORD"); 35 | assertSegmenterCorrect(segmenter.next(), "foo", 56, 59, 3, "ENGLISH"); 36 | assertSegmenterCorrect(segmenter.next(), "准确值", 60, 63, 3, "CN_WORD"); 37 | assertSegmenterCorrect(segmenter.next(), "foo", 63, 66, 3, "ENGLISH"); 38 | assertSegmenterCorrect(segmenter.next(), "不同", 67, 69, 2, "CN_WORD"); 39 | assertSegmenterCorrect(segmenter.next(), "准确值", 71, 74, 3, "CN_WORD"); 40 | assertSegmenterCorrect(segmenter.next(), "2014", 74, 78, 4, "ARABIC"); 41 | assertSegmenterCorrect(segmenter.next(), "准确值", 79, 82, 3, "CN_WORD"); 42 | assertSegmenterCorrect(segmenter.next(), "2014-09-15", 82, 92, 10, "LETTER"); 43 | assertSegmenterCorrect(segmenter.next(), "也是", 92, 94, 2, "CN_WORD"); 44 | assertSegmenterCorrect(segmenter.next(), "不同", 94, 96, 2, "CN_WORD"); 45 | assertSegmenterCorrect(segmenter.next(), "测试", 98, 100, 2, "CN_WORD"); 46 | } 47 | 48 | private void assertSegmenterCorrect(Lexeme nextLexeme, String lexemeText, int begin, int end, int length, String type) { 49 | Assert.assertEquals(lexemeText, nextLexeme.getLexemeText()); // JUnit's assertEquals takes (expected, actual) 50 | Assert.assertEquals(begin, nextLexeme.getBeginPosition()); 51 | Assert.assertEquals(end, nextLexeme.getEndPosition()); 52 | Assert.assertEquals(length, nextLexeme.getLength()); 53 | Assert.assertEquals(type, nextLexeme.getLexemeTypeString()); 54 | 55 | } 56 | 57 | private void print(Lexeme nextLexeme){ 58 | System.out.println(nextLexeme.getLexemeText()); 59 | System.out.println(nextLexeme.getBeginPosition()); 60 | System.out.println(nextLexeme.getEndPosition()); 61 | System.out.println(nextLexeme.getLength()); 62 | System.out.println(nextLexeme.getLexemeTypeString()); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /ik-analysis-core/src/test/java/org/wltea/analyzer/MockDictionary.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | 4 | import 
org.wltea.analyzer.configuration.DictionaryConfiguration; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | public class MockDictionary implements DictionaryConfiguration { 10 | 11 | private final List mainDictionary; 12 | private final List quantifierDictionary; 13 | private final List stopWordDictionary; 14 | 15 | 16 | private boolean smartMode = true; 17 | 18 | 19 | private MockDictionary() { 20 | 21 | mainDictionary = new ArrayList(); 22 | quantifierDictionary = new ArrayList(); 23 | stopWordDictionary = new ArrayList(); 24 | 25 | 26 | initStopWordDictionary(stopWordDictionary); 27 | 28 | mainDictionary.add("一一分".toCharArray()); 29 | mainDictionary.add("听".toCharArray()); 30 | mainDictionary.add("上去".toCharArray()); 31 | mainDictionary.add("那样".toCharArray()); 32 | mainDictionary.add("干柴".toCharArray()); 33 | mainDictionary.add("诸如".toCharArray()); 34 | mainDictionary.add("日期".toCharArray()); 35 | mainDictionary.add("用户".toCharArray()); 36 | mainDictionary.add("当然".toCharArray()); 37 | mainDictionary.add("字符串".toCharArray()); 38 | mainDictionary.add("以是".toCharArray()); 39 | mainDictionary.add("准确值".toCharArray()); 40 | mainDictionary.add("用户名".toCharArray()); 41 | mainDictionary.add("邮件地址".toCharArray()); 42 | mainDictionary.add("准确值".toCharArray()); 43 | mainDictionary.add("不同".toCharArray()); 44 | mainDictionary.add("也是".toCharArray()); 45 | mainDictionary.add("测试".toCharArray()); 46 | } 47 | 48 | public static MockDictionary smartModeSqlite3Configure() { 49 | MockDictionary sqlite3Configure = new MockDictionary(); 50 | sqlite3Configure.setSmartMode(true); 51 | return sqlite3Configure; 52 | } 53 | 54 | private void initStopWordDictionary(List stopWordDictionary) { 55 | stopWordDictionary.add("a".toCharArray()); 56 | stopWordDictionary.add("an".toCharArray()); 57 | stopWordDictionary.add("and".toCharArray()); 58 | stopWordDictionary.add("are".toCharArray()); 59 | stopWordDictionary.add("as".toCharArray()); 60 | stopWordDictionary.add("at".toCharArray()); 61 | stopWordDictionary.add("be".toCharArray()); 62 | stopWordDictionary.add("but".toCharArray()); 63 | stopWordDictionary.add("by".toCharArray()); 64 | stopWordDictionary.add("for".toCharArray()); 65 | stopWordDictionary.add("if".toCharArray()); 66 | stopWordDictionary.add("in".toCharArray()); 67 | stopWordDictionary.add("into".toCharArray()); 68 | stopWordDictionary.add("is".toCharArray()); 69 | stopWordDictionary.add("it".toCharArray()); 70 | stopWordDictionary.add("no".toCharArray()); 71 | stopWordDictionary.add("not".toCharArray()); 72 | stopWordDictionary.add("of".toCharArray()); 73 | stopWordDictionary.add("on".toCharArray()); 74 | stopWordDictionary.add("or".toCharArray()); 75 | stopWordDictionary.add("such".toCharArray()); 76 | stopWordDictionary.add("that".toCharArray()); 77 | stopWordDictionary.add("the".toCharArray()); 78 | stopWordDictionary.add("their".toCharArray()); 79 | stopWordDictionary.add("then".toCharArray()); 80 | stopWordDictionary.add("there".toCharArray()); 81 | stopWordDictionary.add("these".toCharArray()); 82 | stopWordDictionary.add("they".toCharArray()); 83 | stopWordDictionary.add("this".toCharArray()); 84 | stopWordDictionary.add("to".toCharArray()); 85 | stopWordDictionary.add("was".toCharArray()); 86 | stopWordDictionary.add("will".toCharArray()); 87 | stopWordDictionary.add("with".toCharArray()); 88 | stopWordDictionary.add("更好的".toCharArray()); 89 | stopWordDictionary.add("选择".toCharArray()); 90 | stopWordDictionary.add("啊".toCharArray()); 91 | 
stopWordDictionary.add("阿".toCharArray()); 92 | stopWordDictionary.add("哎".toCharArray()); 93 | stopWordDictionary.add("哎呀".toCharArray()); 94 | stopWordDictionary.add("哎哟".toCharArray()); 95 | stopWordDictionary.add("唉".toCharArray()); 96 | stopWordDictionary.add("俺".toCharArray()); 97 | stopWordDictionary.add("俺们".toCharArray()); 98 | stopWordDictionary.add("按".toCharArray()); 99 | stopWordDictionary.add("按照".toCharArray()); 100 | stopWordDictionary.add("吧".toCharArray()); 101 | stopWordDictionary.add("吧哒".toCharArray()); 102 | stopWordDictionary.add("把".toCharArray()); 103 | stopWordDictionary.add("罢了".toCharArray()); 104 | stopWordDictionary.add("被".toCharArray()); 105 | stopWordDictionary.add("本".toCharArray()); 106 | stopWordDictionary.add("本着".toCharArray()); 107 | stopWordDictionary.add("比".toCharArray()); 108 | stopWordDictionary.add("比方".toCharArray()); 109 | stopWordDictionary.add("比如".toCharArray()); 110 | stopWordDictionary.add("鄙人".toCharArray()); 111 | stopWordDictionary.add("彼".toCharArray()); 112 | stopWordDictionary.add("彼此".toCharArray()); 113 | stopWordDictionary.add("边".toCharArray()); 114 | stopWordDictionary.add("别".toCharArray()); 115 | stopWordDictionary.add("别的".toCharArray()); 116 | stopWordDictionary.add("别说".toCharArray()); 117 | stopWordDictionary.add("并".toCharArray()); 118 | stopWordDictionary.add("并且".toCharArray()); 119 | stopWordDictionary.add("不比".toCharArray()); 120 | stopWordDictionary.add("不成".toCharArray()); 121 | stopWordDictionary.add("不单".toCharArray()); 122 | stopWordDictionary.add("不但".toCharArray()); 123 | stopWordDictionary.add("不独".toCharArray()); 124 | stopWordDictionary.add("不管".toCharArray()); 125 | stopWordDictionary.add("不光".toCharArray()); 126 | stopWordDictionary.add("不过".toCharArray()); 127 | stopWordDictionary.add("不仅".toCharArray()); 128 | stopWordDictionary.add("不拘".toCharArray()); 129 | stopWordDictionary.add("不论".toCharArray()); 130 | stopWordDictionary.add("不怕".toCharArray()); 131 | stopWordDictionary.add("不然".toCharArray()); 132 | stopWordDictionary.add("不如".toCharArray()); 133 | stopWordDictionary.add("不特".toCharArray()); 134 | stopWordDictionary.add("不惟".toCharArray()); 135 | stopWordDictionary.add("不问".toCharArray()); 136 | stopWordDictionary.add("不只".toCharArray()); 137 | stopWordDictionary.add("朝".toCharArray()); 138 | stopWordDictionary.add("朝着".toCharArray()); 139 | stopWordDictionary.add("趁".toCharArray()); 140 | stopWordDictionary.add("趁着".toCharArray()); 141 | stopWordDictionary.add("乘".toCharArray()); 142 | stopWordDictionary.add("冲".toCharArray()); 143 | stopWordDictionary.add("除".toCharArray()); 144 | stopWordDictionary.add("除此之外".toCharArray()); 145 | stopWordDictionary.add("除非".toCharArray()); 146 | stopWordDictionary.add("除了".toCharArray()); 147 | stopWordDictionary.add("此".toCharArray()); 148 | stopWordDictionary.add("此间".toCharArray()); 149 | stopWordDictionary.add("此外".toCharArray()); 150 | stopWordDictionary.add("从".toCharArray()); 151 | stopWordDictionary.add("从而".toCharArray()); 152 | stopWordDictionary.add("打".toCharArray()); 153 | stopWordDictionary.add("待".toCharArray()); 154 | stopWordDictionary.add("但".toCharArray()); 155 | stopWordDictionary.add("但是".toCharArray()); 156 | stopWordDictionary.add("当".toCharArray()); 157 | stopWordDictionary.add("当着".toCharArray()); 158 | stopWordDictionary.add("到".toCharArray()); 159 | stopWordDictionary.add("得".toCharArray()); 160 | stopWordDictionary.add("的".toCharArray()); 161 | stopWordDictionary.add("的话".toCharArray()); 162 | 
stopWordDictionary.add("等".toCharArray()); 163 | stopWordDictionary.add("等等".toCharArray()); 164 | stopWordDictionary.add("地".toCharArray()); 165 | stopWordDictionary.add("第".toCharArray()); 166 | stopWordDictionary.add("叮咚".toCharArray()); 167 | stopWordDictionary.add("对".toCharArray()); 168 | stopWordDictionary.add("对于".toCharArray()); 169 | stopWordDictionary.add("多".toCharArray()); 170 | stopWordDictionary.add("多少".toCharArray()); 171 | stopWordDictionary.add("而".toCharArray()); 172 | stopWordDictionary.add("而况".toCharArray()); 173 | stopWordDictionary.add("而且".toCharArray()); 174 | stopWordDictionary.add("而是".toCharArray()); 175 | stopWordDictionary.add("而外".toCharArray()); 176 | stopWordDictionary.add("而言".toCharArray()); 177 | stopWordDictionary.add("而已".toCharArray()); 178 | stopWordDictionary.add("尔后".toCharArray()); 179 | stopWordDictionary.add("反过来".toCharArray()); 180 | stopWordDictionary.add("反过来说".toCharArray()); 181 | stopWordDictionary.add("反之".toCharArray()); 182 | stopWordDictionary.add("非但".toCharArray()); 183 | stopWordDictionary.add("非徒".toCharArray()); 184 | stopWordDictionary.add("否则".toCharArray()); 185 | stopWordDictionary.add("嘎".toCharArray()); 186 | stopWordDictionary.add("嘎登".toCharArray()); 187 | stopWordDictionary.add("该".toCharArray()); 188 | stopWordDictionary.add("赶".toCharArray()); 189 | stopWordDictionary.add("个".toCharArray()); 190 | stopWordDictionary.add("各".toCharArray()); 191 | stopWordDictionary.add("各个".toCharArray()); 192 | stopWordDictionary.add("各位".toCharArray()); 193 | stopWordDictionary.add("各种".toCharArray()); 194 | stopWordDictionary.add("各自".toCharArray()); 195 | stopWordDictionary.add("给".toCharArray()); 196 | stopWordDictionary.add("根据".toCharArray()); 197 | stopWordDictionary.add("跟".toCharArray()); 198 | stopWordDictionary.add("故".toCharArray()); 199 | stopWordDictionary.add("故此".toCharArray()); 200 | stopWordDictionary.add("固然".toCharArray()); 201 | stopWordDictionary.add("关于".toCharArray()); 202 | stopWordDictionary.add("管".toCharArray()); 203 | stopWordDictionary.add("归".toCharArray()); 204 | stopWordDictionary.add("果然".toCharArray()); 205 | stopWordDictionary.add("果真".toCharArray()); 206 | stopWordDictionary.add("过".toCharArray()); 207 | stopWordDictionary.add("哈".toCharArray()); 208 | stopWordDictionary.add("哈哈".toCharArray()); 209 | stopWordDictionary.add("呵".toCharArray()); 210 | stopWordDictionary.add("和".toCharArray()); 211 | stopWordDictionary.add("何".toCharArray()); 212 | stopWordDictionary.add("何处".toCharArray()); 213 | stopWordDictionary.add("何况".toCharArray()); 214 | stopWordDictionary.add("何时".toCharArray()); 215 | stopWordDictionary.add("嘿".toCharArray()); 216 | stopWordDictionary.add("哼".toCharArray()); 217 | stopWordDictionary.add("哼唷".toCharArray()); 218 | stopWordDictionary.add("呼哧".toCharArray()); 219 | stopWordDictionary.add("乎".toCharArray()); 220 | stopWordDictionary.add("哗".toCharArray()); 221 | stopWordDictionary.add("还是".toCharArray()); 222 | stopWordDictionary.add("还有".toCharArray()); 223 | stopWordDictionary.add("换句话说".toCharArray()); 224 | stopWordDictionary.add("换言之".toCharArray()); 225 | stopWordDictionary.add("或".toCharArray()); 226 | stopWordDictionary.add("或是".toCharArray()); 227 | stopWordDictionary.add("或者".toCharArray()); 228 | stopWordDictionary.add("极了".toCharArray()); 229 | stopWordDictionary.add("及".toCharArray()); 230 | stopWordDictionary.add("及其".toCharArray()); 231 | stopWordDictionary.add("及至".toCharArray()); 232 | stopWordDictionary.add("即".toCharArray()); 233 | 
stopWordDictionary.add("即便".toCharArray()); 234 | stopWordDictionary.add("即或".toCharArray()); 235 | stopWordDictionary.add("即令".toCharArray()); 236 | stopWordDictionary.add("即若".toCharArray()); 237 | stopWordDictionary.add("即使".toCharArray()); 238 | stopWordDictionary.add("几".toCharArray()); 239 | stopWordDictionary.add("几时".toCharArray()); 240 | stopWordDictionary.add("己".toCharArray()); 241 | stopWordDictionary.add("既".toCharArray()); 242 | stopWordDictionary.add("既然".toCharArray()); 243 | stopWordDictionary.add("既是".toCharArray()); 244 | stopWordDictionary.add("继而".toCharArray()); 245 | stopWordDictionary.add("加之".toCharArray()); 246 | stopWordDictionary.add("假如".toCharArray()); 247 | stopWordDictionary.add("假若".toCharArray()); 248 | stopWordDictionary.add("假使".toCharArray()); 249 | stopWordDictionary.add("鉴于".toCharArray()); 250 | stopWordDictionary.add("将".toCharArray()); 251 | stopWordDictionary.add("较".toCharArray()); 252 | stopWordDictionary.add("较之".toCharArray()); 253 | stopWordDictionary.add("叫".toCharArray()); 254 | stopWordDictionary.add("接着".toCharArray()); 255 | stopWordDictionary.add("结果".toCharArray()); 256 | stopWordDictionary.add("借".toCharArray()); 257 | stopWordDictionary.add("紧接着".toCharArray()); 258 | stopWordDictionary.add("进而".toCharArray()); 259 | stopWordDictionary.add("尽".toCharArray()); 260 | stopWordDictionary.add("尽管".toCharArray()); 261 | stopWordDictionary.add("经".toCharArray()); 262 | stopWordDictionary.add("经过".toCharArray()); 263 | stopWordDictionary.add("就".toCharArray()); 264 | stopWordDictionary.add("就是".toCharArray()); 265 | stopWordDictionary.add("就是说".toCharArray()); 266 | stopWordDictionary.add("据".toCharArray()); 267 | stopWordDictionary.add("具体地说".toCharArray()); 268 | stopWordDictionary.add("具体说来".toCharArray()); 269 | stopWordDictionary.add("开始".toCharArray()); 270 | stopWordDictionary.add("开外".toCharArray()); 271 | stopWordDictionary.add("靠".toCharArray()); 272 | stopWordDictionary.add("咳".toCharArray()); 273 | stopWordDictionary.add("可".toCharArray()); 274 | stopWordDictionary.add("可见".toCharArray()); 275 | stopWordDictionary.add("可是".toCharArray()); 276 | stopWordDictionary.add("可以".toCharArray()); 277 | stopWordDictionary.add("况且".toCharArray()); 278 | stopWordDictionary.add("啦".toCharArray()); 279 | stopWordDictionary.add("来".toCharArray()); 280 | stopWordDictionary.add("来着".toCharArray()); 281 | stopWordDictionary.add("离".toCharArray()); 282 | stopWordDictionary.add("例如".toCharArray()); 283 | stopWordDictionary.add("哩".toCharArray()); 284 | stopWordDictionary.add("连".toCharArray()); 285 | stopWordDictionary.add("连同".toCharArray()); 286 | stopWordDictionary.add("两者".toCharArray()); 287 | stopWordDictionary.add("了".toCharArray()); 288 | stopWordDictionary.add("临".toCharArray()); 289 | stopWordDictionary.add("另".toCharArray()); 290 | stopWordDictionary.add("另外".toCharArray()); 291 | stopWordDictionary.add("另一方面".toCharArray()); 292 | stopWordDictionary.add("论".toCharArray()); 293 | stopWordDictionary.add("嘛".toCharArray()); 294 | stopWordDictionary.add("吗".toCharArray()); 295 | stopWordDictionary.add("慢说".toCharArray()); 296 | stopWordDictionary.add("漫说".toCharArray()); 297 | stopWordDictionary.add("冒".toCharArray()); 298 | stopWordDictionary.add("么".toCharArray()); 299 | stopWordDictionary.add("每".toCharArray()); 300 | stopWordDictionary.add("每当".toCharArray()); 301 | stopWordDictionary.add("们".toCharArray()); 302 | stopWordDictionary.add("莫若".toCharArray()); 303 | stopWordDictionary.add("某".toCharArray()); 304 | 
stopWordDictionary.add("某个".toCharArray()); 305 | stopWordDictionary.add("某些".toCharArray()); 306 | stopWordDictionary.add("拿".toCharArray()); 307 | stopWordDictionary.add("哪".toCharArray()); 308 | stopWordDictionary.add("哪边".toCharArray()); 309 | stopWordDictionary.add("哪儿".toCharArray()); 310 | stopWordDictionary.add("哪个".toCharArray()); 311 | stopWordDictionary.add("哪里".toCharArray()); 312 | stopWordDictionary.add("哪年".toCharArray()); 313 | stopWordDictionary.add("哪怕".toCharArray()); 314 | stopWordDictionary.add("哪天".toCharArray()); 315 | stopWordDictionary.add("哪些".toCharArray()); 316 | stopWordDictionary.add("哪样".toCharArray()); 317 | stopWordDictionary.add("那".toCharArray()); 318 | stopWordDictionary.add("那边".toCharArray()); 319 | stopWordDictionary.add("那儿".toCharArray()); 320 | stopWordDictionary.add("那个".toCharArray()); 321 | stopWordDictionary.add("那会儿".toCharArray()); 322 | stopWordDictionary.add("那里".toCharArray()); 323 | stopWordDictionary.add("那么".toCharArray()); 324 | stopWordDictionary.add("那么些".toCharArray()); 325 | stopWordDictionary.add("那么样".toCharArray()); 326 | stopWordDictionary.add("那时".toCharArray()); 327 | stopWordDictionary.add("那些".toCharArray()); 328 | stopWordDictionary.add("那样".toCharArray()); 329 | stopWordDictionary.add("乃".toCharArray()); 330 | stopWordDictionary.add("乃至".toCharArray()); 331 | stopWordDictionary.add("呢".toCharArray()); 332 | stopWordDictionary.add("能".toCharArray()); 333 | stopWordDictionary.add("你".toCharArray()); 334 | stopWordDictionary.add("你们".toCharArray()); 335 | stopWordDictionary.add("您".toCharArray()); 336 | stopWordDictionary.add("宁".toCharArray()); 337 | stopWordDictionary.add("宁可".toCharArray()); 338 | stopWordDictionary.add("宁肯".toCharArray()); 339 | stopWordDictionary.add("宁愿".toCharArray()); 340 | stopWordDictionary.add("哦".toCharArray()); 341 | stopWordDictionary.add("呕".toCharArray()); 342 | stopWordDictionary.add("啪达".toCharArray()); 343 | stopWordDictionary.add("旁人".toCharArray()); 344 | stopWordDictionary.add("呸".toCharArray()); 345 | stopWordDictionary.add("凭".toCharArray()); 346 | stopWordDictionary.add("凭借".toCharArray()); 347 | stopWordDictionary.add("其".toCharArray()); 348 | stopWordDictionary.add("其次".toCharArray()); 349 | stopWordDictionary.add("其二".toCharArray()); 350 | stopWordDictionary.add("其他".toCharArray()); 351 | stopWordDictionary.add("其它".toCharArray()); 352 | stopWordDictionary.add("其一".toCharArray()); 353 | stopWordDictionary.add("其余".toCharArray()); 354 | stopWordDictionary.add("其中".toCharArray()); 355 | stopWordDictionary.add("起".toCharArray()); 356 | stopWordDictionary.add("起见".toCharArray()); 357 | stopWordDictionary.add("起见".toCharArray()); 358 | stopWordDictionary.add("岂但".toCharArray()); 359 | stopWordDictionary.add("恰恰相反".toCharArray()); 360 | stopWordDictionary.add("前后".toCharArray()); 361 | stopWordDictionary.add("前者".toCharArray()); 362 | stopWordDictionary.add("且".toCharArray()); 363 | stopWordDictionary.add("然而".toCharArray()); 364 | stopWordDictionary.add("然后".toCharArray()); 365 | stopWordDictionary.add("然则".toCharArray()); 366 | stopWordDictionary.add("让".toCharArray()); 367 | stopWordDictionary.add("人家".toCharArray()); 368 | stopWordDictionary.add("任".toCharArray()); 369 | stopWordDictionary.add("任何".toCharArray()); 370 | stopWordDictionary.add("任凭".toCharArray()); 371 | stopWordDictionary.add("如".toCharArray()); 372 | stopWordDictionary.add("如此".toCharArray()); 373 | stopWordDictionary.add("如果".toCharArray()); 374 | stopWordDictionary.add("如何".toCharArray()); 375 | 
stopWordDictionary.add("如其".toCharArray()); 376 | stopWordDictionary.add("如若".toCharArray()); 377 | stopWordDictionary.add("如上所述".toCharArray()); 378 | stopWordDictionary.add("若".toCharArray()); 379 | stopWordDictionary.add("若非".toCharArray()); 380 | stopWordDictionary.add("若是".toCharArray()); 381 | stopWordDictionary.add("啥".toCharArray()); 382 | stopWordDictionary.add("上下".toCharArray()); 383 | stopWordDictionary.add("尚且".toCharArray()); 384 | stopWordDictionary.add("设若".toCharArray()); 385 | stopWordDictionary.add("设使".toCharArray()); 386 | stopWordDictionary.add("甚而".toCharArray()); 387 | stopWordDictionary.add("甚么".toCharArray()); 388 | stopWordDictionary.add("甚至".toCharArray()); 389 | stopWordDictionary.add("省得".toCharArray()); 390 | stopWordDictionary.add("时候".toCharArray()); 391 | stopWordDictionary.add("什么".toCharArray()); 392 | stopWordDictionary.add("什么样".toCharArray()); 393 | stopWordDictionary.add("使得".toCharArray()); 394 | stopWordDictionary.add("是".toCharArray()); 395 | stopWordDictionary.add("是的".toCharArray()); 396 | stopWordDictionary.add("首先".toCharArray()); 397 | stopWordDictionary.add("谁".toCharArray()); 398 | stopWordDictionary.add("谁知".toCharArray()); 399 | stopWordDictionary.add("顺".toCharArray()); 400 | stopWordDictionary.add("顺着".toCharArray()); 401 | stopWordDictionary.add("似的".toCharArray()); 402 | stopWordDictionary.add("虽".toCharArray()); 403 | stopWordDictionary.add("虽然".toCharArray()); 404 | stopWordDictionary.add("虽说".toCharArray()); 405 | stopWordDictionary.add("虽则".toCharArray()); 406 | stopWordDictionary.add("随".toCharArray()); 407 | stopWordDictionary.add("随着".toCharArray()); 408 | stopWordDictionary.add("所".toCharArray()); 409 | stopWordDictionary.add("所以".toCharArray()); 410 | stopWordDictionary.add("他".toCharArray()); 411 | stopWordDictionary.add("他们".toCharArray()); 412 | stopWordDictionary.add("他人".toCharArray()); 413 | stopWordDictionary.add("它".toCharArray()); 414 | stopWordDictionary.add("它们".toCharArray()); 415 | stopWordDictionary.add("她".toCharArray()); 416 | stopWordDictionary.add("她们".toCharArray()); 417 | stopWordDictionary.add("倘".toCharArray()); 418 | stopWordDictionary.add("倘或".toCharArray()); 419 | stopWordDictionary.add("倘然".toCharArray()); 420 | stopWordDictionary.add("倘若".toCharArray()); 421 | stopWordDictionary.add("倘使".toCharArray()); 422 | stopWordDictionary.add("腾".toCharArray()); 423 | stopWordDictionary.add("替".toCharArray()); 424 | stopWordDictionary.add("通过".toCharArray()); 425 | stopWordDictionary.add("同".toCharArray()); 426 | stopWordDictionary.add("同时".toCharArray()); 427 | stopWordDictionary.add("哇".toCharArray()); 428 | stopWordDictionary.add("万一".toCharArray()); 429 | stopWordDictionary.add("往".toCharArray()); 430 | stopWordDictionary.add("望".toCharArray()); 431 | stopWordDictionary.add("为".toCharArray()); 432 | stopWordDictionary.add("为何".toCharArray()); 433 | stopWordDictionary.add("为了".toCharArray()); 434 | stopWordDictionary.add("为什么".toCharArray()); 435 | stopWordDictionary.add("为着".toCharArray()); 436 | stopWordDictionary.add("喂".toCharArray()); 437 | stopWordDictionary.add("嗡嗡".toCharArray()); 438 | stopWordDictionary.add("我".toCharArray()); 439 | stopWordDictionary.add("我们".toCharArray()); 440 | stopWordDictionary.add("呜".toCharArray()); 441 | stopWordDictionary.add("呜呼".toCharArray()); 442 | stopWordDictionary.add("乌乎".toCharArray()); 443 | stopWordDictionary.add("无论".toCharArray()); 444 | stopWordDictionary.add("无宁".toCharArray()); 445 | stopWordDictionary.add("毋宁".toCharArray()); 446 | 
stopWordDictionary.add("嘻".toCharArray()); 447 | stopWordDictionary.add("吓".toCharArray()); 448 | stopWordDictionary.add("相对而言".toCharArray()); 449 | stopWordDictionary.add("像".toCharArray()); 450 | stopWordDictionary.add("向".toCharArray()); 451 | stopWordDictionary.add("向着".toCharArray()); 452 | stopWordDictionary.add("嘘".toCharArray()); 453 | stopWordDictionary.add("呀".toCharArray()); 454 | stopWordDictionary.add("焉".toCharArray()); 455 | stopWordDictionary.add("沿".toCharArray()); 456 | stopWordDictionary.add("沿着".toCharArray()); 457 | stopWordDictionary.add("要".toCharArray()); 458 | stopWordDictionary.add("要不".toCharArray()); 459 | stopWordDictionary.add("要不然".toCharArray()); 460 | stopWordDictionary.add("要不是".toCharArray()); 461 | stopWordDictionary.add("要么".toCharArray()); 462 | stopWordDictionary.add("要是".toCharArray()); 463 | stopWordDictionary.add("也".toCharArray()); 464 | stopWordDictionary.add("也罢".toCharArray()); 465 | stopWordDictionary.add("也好".toCharArray()); 466 | stopWordDictionary.add("一".toCharArray()); 467 | stopWordDictionary.add("一般".toCharArray()); 468 | stopWordDictionary.add("一旦".toCharArray()); 469 | stopWordDictionary.add("一方面".toCharArray()); 470 | stopWordDictionary.add("一来".toCharArray()); 471 | stopWordDictionary.add("一切".toCharArray()); 472 | stopWordDictionary.add("一样".toCharArray()); 473 | stopWordDictionary.add("一则".toCharArray()); 474 | stopWordDictionary.add("依".toCharArray()); 475 | stopWordDictionary.add("依照".toCharArray()); 476 | stopWordDictionary.add("矣".toCharArray()); 477 | stopWordDictionary.add("以".toCharArray()); 478 | stopWordDictionary.add("以便".toCharArray()); 479 | stopWordDictionary.add("以及".toCharArray()); 480 | stopWordDictionary.add("以免".toCharArray()); 481 | stopWordDictionary.add("以至".toCharArray()); 482 | stopWordDictionary.add("以至于".toCharArray()); 483 | stopWordDictionary.add("以致".toCharArray()); 484 | stopWordDictionary.add("抑或".toCharArray()); 485 | stopWordDictionary.add("因".toCharArray()); 486 | stopWordDictionary.add("因此".toCharArray()); 487 | stopWordDictionary.add("因而".toCharArray()); 488 | stopWordDictionary.add("因为".toCharArray()); 489 | stopWordDictionary.add("哟".toCharArray()); 490 | stopWordDictionary.add("用".toCharArray()); 491 | stopWordDictionary.add("由".toCharArray()); 492 | stopWordDictionary.add("由此可见".toCharArray()); 493 | stopWordDictionary.add("由于".toCharArray()); 494 | stopWordDictionary.add("有".toCharArray()); 495 | stopWordDictionary.add("有的".toCharArray()); 496 | stopWordDictionary.add("有关".toCharArray()); 497 | stopWordDictionary.add("有些".toCharArray()); 498 | stopWordDictionary.add("又".toCharArray()); 499 | stopWordDictionary.add("于".toCharArray()); 500 | stopWordDictionary.add("于是".toCharArray()); 501 | stopWordDictionary.add("于是乎".toCharArray()); 502 | stopWordDictionary.add("与".toCharArray()); 503 | stopWordDictionary.add("与此同时".toCharArray()); 504 | stopWordDictionary.add("与否".toCharArray()); 505 | stopWordDictionary.add("与其".toCharArray()); 506 | stopWordDictionary.add("越是".toCharArray()); 507 | stopWordDictionary.add("云云".toCharArray()); 508 | stopWordDictionary.add("哉".toCharArray()); 509 | stopWordDictionary.add("再说".toCharArray()); 510 | stopWordDictionary.add("再者".toCharArray()); 511 | stopWordDictionary.add("在".toCharArray()); 512 | stopWordDictionary.add("在下".toCharArray()); 513 | stopWordDictionary.add("咱".toCharArray()); 514 | stopWordDictionary.add("咱们".toCharArray()); 515 | stopWordDictionary.add("则".toCharArray()); 516 | stopWordDictionary.add("怎".toCharArray()); 517 | 
stopWordDictionary.add("怎么".toCharArray()); 518 | stopWordDictionary.add("怎么办".toCharArray()); 519 | stopWordDictionary.add("怎么样".toCharArray()); 520 | stopWordDictionary.add("怎样".toCharArray()); 521 | stopWordDictionary.add("咋".toCharArray()); 522 | stopWordDictionary.add("照".toCharArray()); 523 | stopWordDictionary.add("照着".toCharArray()); 524 | stopWordDictionary.add("者".toCharArray()); 525 | stopWordDictionary.add("这".toCharArray()); 526 | stopWordDictionary.add("这边".toCharArray()); 527 | stopWordDictionary.add("这儿".toCharArray()); 528 | stopWordDictionary.add("这个".toCharArray()); 529 | stopWordDictionary.add("这会儿".toCharArray()); 530 | stopWordDictionary.add("这就是说".toCharArray()); 531 | stopWordDictionary.add("这里".toCharArray()); 532 | stopWordDictionary.add("这么".toCharArray()); 533 | stopWordDictionary.add("这么点儿".toCharArray()); 534 | stopWordDictionary.add("这么些".toCharArray()); 535 | stopWordDictionary.add("这么样".toCharArray()); 536 | stopWordDictionary.add("这时".toCharArray()); 537 | stopWordDictionary.add("这些".toCharArray()); 538 | stopWordDictionary.add("这样".toCharArray()); 539 | stopWordDictionary.add("正如".toCharArray()); 540 | stopWordDictionary.add("a".toCharArray()); 541 | stopWordDictionary.add("an".toCharArray()); 542 | stopWordDictionary.add("and".toCharArray()); 543 | stopWordDictionary.add("are".toCharArray()); 544 | stopWordDictionary.add("as".toCharArray()); 545 | stopWordDictionary.add("at".toCharArray()); 546 | stopWordDictionary.add("be".toCharArray()); 547 | stopWordDictionary.add("but".toCharArray()); 548 | stopWordDictionary.add("by".toCharArray()); 549 | stopWordDictionary.add("for".toCharArray()); 550 | stopWordDictionary.add("if".toCharArray()); 551 | stopWordDictionary.add("in".toCharArray()); 552 | stopWordDictionary.add("into".toCharArray()); 553 | stopWordDictionary.add("is".toCharArray()); 554 | stopWordDictionary.add("it".toCharArray()); 555 | stopWordDictionary.add("no".toCharArray()); 556 | stopWordDictionary.add("not".toCharArray()); 557 | stopWordDictionary.add("of".toCharArray()); 558 | stopWordDictionary.add("on".toCharArray()); 559 | stopWordDictionary.add("or".toCharArray()); 560 | stopWordDictionary.add("such".toCharArray()); 561 | stopWordDictionary.add("that".toCharArray()); 562 | stopWordDictionary.add("the".toCharArray()); 563 | stopWordDictionary.add("their".toCharArray()); 564 | stopWordDictionary.add("then".toCharArray()); 565 | stopWordDictionary.add("there".toCharArray()); 566 | stopWordDictionary.add("these".toCharArray()); 567 | stopWordDictionary.add("they".toCharArray()); 568 | stopWordDictionary.add("this".toCharArray()); 569 | stopWordDictionary.add("to".toCharArray()); 570 | stopWordDictionary.add("was".toCharArray()); 571 | stopWordDictionary.add("will".toCharArray()); 572 | stopWordDictionary.add("with".toCharArray()); 573 | stopWordDictionary.add("更好的".toCharArray()); 574 | stopWordDictionary.add("选择".toCharArray()); 575 | stopWordDictionary.add("啊".toCharArray()); 576 | stopWordDictionary.add("阿".toCharArray()); 577 | stopWordDictionary.add("哎".toCharArray()); 578 | stopWordDictionary.add("哎呀".toCharArray()); 579 | stopWordDictionary.add("哎哟".toCharArray()); 580 | stopWordDictionary.add("唉".toCharArray()); 581 | stopWordDictionary.add("俺".toCharArray()); 582 | stopWordDictionary.add("俺们".toCharArray()); 583 | stopWordDictionary.add("按".toCharArray()); 584 | stopWordDictionary.add("按照".toCharArray()); 585 | stopWordDictionary.add("吧".toCharArray()); 586 | stopWordDictionary.add("吧哒".toCharArray()); 587 | 
stopWordDictionary.add("把".toCharArray()); 588 | stopWordDictionary.add("罢了".toCharArray()); 589 | stopWordDictionary.add("被".toCharArray()); 590 | stopWordDictionary.add("本".toCharArray()); 591 | stopWordDictionary.add("本着".toCharArray()); 592 | stopWordDictionary.add("比".toCharArray()); 593 | stopWordDictionary.add("比方".toCharArray()); 594 | stopWordDictionary.add("比如".toCharArray()); 595 | stopWordDictionary.add("鄙人".toCharArray()); 596 | stopWordDictionary.add("彼".toCharArray()); 597 | stopWordDictionary.add("彼此".toCharArray()); 598 | stopWordDictionary.add("边".toCharArray()); 599 | stopWordDictionary.add("别".toCharArray()); 600 | stopWordDictionary.add("别的".toCharArray()); 601 | stopWordDictionary.add("别说".toCharArray()); 602 | stopWordDictionary.add("并".toCharArray()); 603 | stopWordDictionary.add("并且".toCharArray()); 604 | stopWordDictionary.add("不比".toCharArray()); 605 | stopWordDictionary.add("不成".toCharArray()); 606 | stopWordDictionary.add("不单".toCharArray()); 607 | stopWordDictionary.add("不但".toCharArray()); 608 | stopWordDictionary.add("不独".toCharArray()); 609 | stopWordDictionary.add("不管".toCharArray()); 610 | stopWordDictionary.add("不光".toCharArray()); 611 | stopWordDictionary.add("不过".toCharArray()); 612 | stopWordDictionary.add("不仅".toCharArray()); 613 | stopWordDictionary.add("不拘".toCharArray()); 614 | stopWordDictionary.add("不论".toCharArray()); 615 | stopWordDictionary.add("不怕".toCharArray()); 616 | stopWordDictionary.add("不然".toCharArray()); 617 | stopWordDictionary.add("不如".toCharArray()); 618 | stopWordDictionary.add("不特".toCharArray()); 619 | stopWordDictionary.add("不惟".toCharArray()); 620 | stopWordDictionary.add("不问".toCharArray()); 621 | stopWordDictionary.add("不只".toCharArray()); 622 | stopWordDictionary.add("朝".toCharArray()); 623 | stopWordDictionary.add("朝着".toCharArray()); 624 | stopWordDictionary.add("趁".toCharArray()); 625 | stopWordDictionary.add("趁着".toCharArray()); 626 | stopWordDictionary.add("乘".toCharArray()); 627 | stopWordDictionary.add("冲".toCharArray()); 628 | stopWordDictionary.add("除".toCharArray()); 629 | stopWordDictionary.add("除此之外".toCharArray()); 630 | stopWordDictionary.add("除非".toCharArray()); 631 | stopWordDictionary.add("除了".toCharArray()); 632 | stopWordDictionary.add("此".toCharArray()); 633 | stopWordDictionary.add("此间".toCharArray()); 634 | stopWordDictionary.add("此外".toCharArray()); 635 | stopWordDictionary.add("从".toCharArray()); 636 | stopWordDictionary.add("从而".toCharArray()); 637 | stopWordDictionary.add("打".toCharArray()); 638 | stopWordDictionary.add("待".toCharArray()); 639 | stopWordDictionary.add("但".toCharArray()); 640 | stopWordDictionary.add("但是".toCharArray()); 641 | stopWordDictionary.add("当".toCharArray()); 642 | stopWordDictionary.add("当着".toCharArray()); 643 | stopWordDictionary.add("到".toCharArray()); 644 | stopWordDictionary.add("得".toCharArray()); 645 | stopWordDictionary.add("的".toCharArray()); 646 | stopWordDictionary.add("的话".toCharArray()); 647 | stopWordDictionary.add("等".toCharArray()); 648 | stopWordDictionary.add("等等".toCharArray()); 649 | stopWordDictionary.add("地".toCharArray()); 650 | stopWordDictionary.add("第".toCharArray()); 651 | stopWordDictionary.add("叮咚".toCharArray()); 652 | stopWordDictionary.add("对".toCharArray()); 653 | stopWordDictionary.add("对于".toCharArray()); 654 | stopWordDictionary.add("多".toCharArray()); 655 | stopWordDictionary.add("多少".toCharArray()); 656 | stopWordDictionary.add("而".toCharArray()); 657 | stopWordDictionary.add("而况".toCharArray()); 658 | 
stopWordDictionary.add("而且".toCharArray()); 659 | stopWordDictionary.add("而是".toCharArray()); 660 | stopWordDictionary.add("而外".toCharArray()); 661 | stopWordDictionary.add("而言".toCharArray()); 662 | stopWordDictionary.add("而已".toCharArray()); 663 | stopWordDictionary.add("尔后".toCharArray()); 664 | stopWordDictionary.add("反过来".toCharArray()); 665 | stopWordDictionary.add("反过来说".toCharArray()); 666 | stopWordDictionary.add("反之".toCharArray()); 667 | stopWordDictionary.add("非但".toCharArray()); 668 | stopWordDictionary.add("非徒".toCharArray()); 669 | stopWordDictionary.add("否则".toCharArray()); 670 | stopWordDictionary.add("嘎".toCharArray()); 671 | stopWordDictionary.add("嘎登".toCharArray()); 672 | stopWordDictionary.add("该".toCharArray()); 673 | stopWordDictionary.add("赶".toCharArray()); 674 | stopWordDictionary.add("个".toCharArray()); 675 | stopWordDictionary.add("各".toCharArray()); 676 | stopWordDictionary.add("各个".toCharArray()); 677 | stopWordDictionary.add("各位".toCharArray()); 678 | stopWordDictionary.add("各种".toCharArray()); 679 | stopWordDictionary.add("各自".toCharArray()); 680 | stopWordDictionary.add("给".toCharArray()); 681 | stopWordDictionary.add("根据".toCharArray()); 682 | stopWordDictionary.add("跟".toCharArray()); 683 | stopWordDictionary.add("故".toCharArray()); 684 | stopWordDictionary.add("故此".toCharArray()); 685 | stopWordDictionary.add("固然".toCharArray()); 686 | stopWordDictionary.add("关于".toCharArray()); 687 | stopWordDictionary.add("管".toCharArray()); 688 | stopWordDictionary.add("归".toCharArray()); 689 | stopWordDictionary.add("果然".toCharArray()); 690 | stopWordDictionary.add("果真".toCharArray()); 691 | stopWordDictionary.add("过".toCharArray()); 692 | stopWordDictionary.add("哈".toCharArray()); 693 | stopWordDictionary.add("哈哈".toCharArray()); 694 | stopWordDictionary.add("呵".toCharArray()); 695 | stopWordDictionary.add("和".toCharArray()); 696 | stopWordDictionary.add("何".toCharArray()); 697 | stopWordDictionary.add("何处".toCharArray()); 698 | stopWordDictionary.add("何况".toCharArray()); 699 | stopWordDictionary.add("何时".toCharArray()); 700 | stopWordDictionary.add("嘿".toCharArray()); 701 | stopWordDictionary.add("哼".toCharArray()); 702 | stopWordDictionary.add("哼唷".toCharArray()); 703 | stopWordDictionary.add("呼哧".toCharArray()); 704 | stopWordDictionary.add("乎".toCharArray()); 705 | stopWordDictionary.add("哗".toCharArray()); 706 | stopWordDictionary.add("还是".toCharArray()); 707 | stopWordDictionary.add("还有".toCharArray()); 708 | stopWordDictionary.add("换句话说".toCharArray()); 709 | stopWordDictionary.add("换言之".toCharArray()); 710 | stopWordDictionary.add("或".toCharArray()); 711 | stopWordDictionary.add("或是".toCharArray()); 712 | stopWordDictionary.add("或者".toCharArray()); 713 | stopWordDictionary.add("极了".toCharArray()); 714 | stopWordDictionary.add("及".toCharArray()); 715 | stopWordDictionary.add("及其".toCharArray()); 716 | stopWordDictionary.add("及至".toCharArray()); 717 | stopWordDictionary.add("即".toCharArray()); 718 | stopWordDictionary.add("即便".toCharArray()); 719 | stopWordDictionary.add("即或".toCharArray()); 720 | stopWordDictionary.add("即令".toCharArray()); 721 | stopWordDictionary.add("即若".toCharArray()); 722 | stopWordDictionary.add("即使".toCharArray()); 723 | stopWordDictionary.add("几".toCharArray()); 724 | stopWordDictionary.add("几时".toCharArray()); 725 | stopWordDictionary.add("己".toCharArray()); 726 | stopWordDictionary.add("既".toCharArray()); 727 | stopWordDictionary.add("既然".toCharArray()); 728 | stopWordDictionary.add("既是".toCharArray()); 729 | 
stopWordDictionary.add("继而".toCharArray()); 730 | stopWordDictionary.add("加之".toCharArray()); 731 | stopWordDictionary.add("假如".toCharArray()); 732 | stopWordDictionary.add("假若".toCharArray()); 733 | stopWordDictionary.add("假使".toCharArray()); 734 | stopWordDictionary.add("鉴于".toCharArray()); 735 | stopWordDictionary.add("将".toCharArray()); 736 | stopWordDictionary.add("较".toCharArray()); 737 | stopWordDictionary.add("较之".toCharArray()); 738 | stopWordDictionary.add("叫".toCharArray()); 739 | stopWordDictionary.add("接着".toCharArray()); 740 | stopWordDictionary.add("结果".toCharArray()); 741 | stopWordDictionary.add("借".toCharArray()); 742 | stopWordDictionary.add("紧接着".toCharArray()); 743 | stopWordDictionary.add("进而".toCharArray()); 744 | stopWordDictionary.add("尽".toCharArray()); 745 | stopWordDictionary.add("尽管".toCharArray()); 746 | stopWordDictionary.add("经".toCharArray()); 747 | stopWordDictionary.add("经过".toCharArray()); 748 | stopWordDictionary.add("就".toCharArray()); 749 | stopWordDictionary.add("就是".toCharArray()); 750 | stopWordDictionary.add("就是说".toCharArray()); 751 | stopWordDictionary.add("据".toCharArray()); 752 | stopWordDictionary.add("具体地说".toCharArray()); 753 | stopWordDictionary.add("具体说来".toCharArray()); 754 | stopWordDictionary.add("开始".toCharArray()); 755 | stopWordDictionary.add("开外".toCharArray()); 756 | stopWordDictionary.add("靠".toCharArray()); 757 | stopWordDictionary.add("咳".toCharArray()); 758 | stopWordDictionary.add("可".toCharArray()); 759 | stopWordDictionary.add("可见".toCharArray()); 760 | stopWordDictionary.add("可是".toCharArray()); 761 | stopWordDictionary.add("可以".toCharArray()); 762 | stopWordDictionary.add("况且".toCharArray()); 763 | stopWordDictionary.add("啦".toCharArray()); 764 | stopWordDictionary.add("来".toCharArray()); 765 | stopWordDictionary.add("来着".toCharArray()); 766 | stopWordDictionary.add("离".toCharArray()); 767 | stopWordDictionary.add("例如".toCharArray()); 768 | stopWordDictionary.add("哩".toCharArray()); 769 | stopWordDictionary.add("连".toCharArray()); 770 | stopWordDictionary.add("连同".toCharArray()); 771 | stopWordDictionary.add("两者".toCharArray()); 772 | stopWordDictionary.add("了".toCharArray()); 773 | stopWordDictionary.add("临".toCharArray()); 774 | stopWordDictionary.add("另".toCharArray()); 775 | stopWordDictionary.add("另外".toCharArray()); 776 | stopWordDictionary.add("另一方面".toCharArray()); 777 | stopWordDictionary.add("论".toCharArray()); 778 | stopWordDictionary.add("嘛".toCharArray()); 779 | stopWordDictionary.add("吗".toCharArray()); 780 | stopWordDictionary.add("慢说".toCharArray()); 781 | stopWordDictionary.add("漫说".toCharArray()); 782 | stopWordDictionary.add("冒".toCharArray()); 783 | stopWordDictionary.add("么".toCharArray()); 784 | stopWordDictionary.add("每".toCharArray()); 785 | stopWordDictionary.add("每当".toCharArray()); 786 | stopWordDictionary.add("们".toCharArray()); 787 | stopWordDictionary.add("莫若".toCharArray()); 788 | stopWordDictionary.add("某".toCharArray()); 789 | stopWordDictionary.add("某个".toCharArray()); 790 | stopWordDictionary.add("某些".toCharArray()); 791 | stopWordDictionary.add("拿".toCharArray()); 792 | stopWordDictionary.add("哪".toCharArray()); 793 | stopWordDictionary.add("哪边".toCharArray()); 794 | stopWordDictionary.add("哪儿".toCharArray()); 795 | stopWordDictionary.add("哪个".toCharArray()); 796 | stopWordDictionary.add("哪里".toCharArray()); 797 | stopWordDictionary.add("哪年".toCharArray()); 798 | stopWordDictionary.add("哪怕".toCharArray()); 799 | stopWordDictionary.add("哪天".toCharArray()); 800 | 
stopWordDictionary.add("哪些".toCharArray()); 801 | stopWordDictionary.add("哪样".toCharArray()); 802 | stopWordDictionary.add("那".toCharArray()); 803 | stopWordDictionary.add("那边".toCharArray()); 804 | stopWordDictionary.add("那儿".toCharArray()); 805 | stopWordDictionary.add("那个".toCharArray()); 806 | stopWordDictionary.add("那会儿".toCharArray()); 807 | stopWordDictionary.add("那里".toCharArray()); 808 | stopWordDictionary.add("那么".toCharArray()); 809 | stopWordDictionary.add("那么些".toCharArray()); 810 | stopWordDictionary.add("那么样".toCharArray()); 811 | stopWordDictionary.add("那时".toCharArray()); 812 | stopWordDictionary.add("那些".toCharArray()); 813 | stopWordDictionary.add("那样".toCharArray()); 814 | stopWordDictionary.add("乃".toCharArray()); 815 | stopWordDictionary.add("乃至".toCharArray()); 816 | stopWordDictionary.add("呢".toCharArray()); 817 | stopWordDictionary.add("能".toCharArray()); 818 | stopWordDictionary.add("你".toCharArray()); 819 | stopWordDictionary.add("你们".toCharArray()); 820 | stopWordDictionary.add("您".toCharArray()); 821 | stopWordDictionary.add("宁".toCharArray()); 822 | stopWordDictionary.add("宁可".toCharArray()); 823 | stopWordDictionary.add("宁肯".toCharArray()); 824 | stopWordDictionary.add("宁愿".toCharArray()); 825 | stopWordDictionary.add("哦".toCharArray()); 826 | stopWordDictionary.add("呕".toCharArray()); 827 | stopWordDictionary.add("啪达".toCharArray()); 828 | stopWordDictionary.add("旁人".toCharArray()); 829 | stopWordDictionary.add("呸".toCharArray()); 830 | stopWordDictionary.add("凭".toCharArray()); 831 | stopWordDictionary.add("凭借".toCharArray()); 832 | stopWordDictionary.add("其".toCharArray()); 833 | stopWordDictionary.add("其次".toCharArray()); 834 | stopWordDictionary.add("其二".toCharArray()); 835 | stopWordDictionary.add("其他".toCharArray()); 836 | stopWordDictionary.add("其它".toCharArray()); 837 | stopWordDictionary.add("其一".toCharArray()); 838 | stopWordDictionary.add("其余".toCharArray()); 839 | stopWordDictionary.add("其中".toCharArray()); 840 | stopWordDictionary.add("起".toCharArray()); 841 | stopWordDictionary.add("起见".toCharArray()); 842 | stopWordDictionary.add("起见".toCharArray()); 843 | stopWordDictionary.add("岂但".toCharArray()); 844 | stopWordDictionary.add("恰恰相反".toCharArray()); 845 | stopWordDictionary.add("前后".toCharArray()); 846 | stopWordDictionary.add("前者".toCharArray()); 847 | stopWordDictionary.add("且".toCharArray()); 848 | stopWordDictionary.add("然而".toCharArray()); 849 | stopWordDictionary.add("然后".toCharArray()); 850 | stopWordDictionary.add("然则".toCharArray()); 851 | stopWordDictionary.add("让".toCharArray()); 852 | stopWordDictionary.add("人家".toCharArray()); 853 | stopWordDictionary.add("任".toCharArray()); 854 | stopWordDictionary.add("任何".toCharArray()); 855 | stopWordDictionary.add("任凭".toCharArray()); 856 | stopWordDictionary.add("如".toCharArray()); 857 | stopWordDictionary.add("如此".toCharArray()); 858 | stopWordDictionary.add("如果".toCharArray()); 859 | stopWordDictionary.add("如何".toCharArray()); 860 | stopWordDictionary.add("如其".toCharArray()); 861 | stopWordDictionary.add("如若".toCharArray()); 862 | stopWordDictionary.add("如上所述".toCharArray()); 863 | stopWordDictionary.add("若".toCharArray()); 864 | stopWordDictionary.add("若非".toCharArray()); 865 | stopWordDictionary.add("若是".toCharArray()); 866 | stopWordDictionary.add("啥".toCharArray()); 867 | stopWordDictionary.add("上下".toCharArray()); 868 | stopWordDictionary.add("尚且".toCharArray()); 869 | stopWordDictionary.add("设若".toCharArray()); 870 | stopWordDictionary.add("设使".toCharArray()); 871 | 
stopWordDictionary.add("甚而".toCharArray()); 872 | stopWordDictionary.add("甚么".toCharArray()); 873 | stopWordDictionary.add("甚至".toCharArray()); 874 | stopWordDictionary.add("省得".toCharArray()); 875 | stopWordDictionary.add("时候".toCharArray()); 876 | stopWordDictionary.add("什么".toCharArray()); 877 | stopWordDictionary.add("什么样".toCharArray()); 878 | stopWordDictionary.add("使得".toCharArray()); 879 | stopWordDictionary.add("是".toCharArray()); 880 | stopWordDictionary.add("是的".toCharArray()); 881 | stopWordDictionary.add("首先".toCharArray()); 882 | stopWordDictionary.add("谁".toCharArray()); 883 | stopWordDictionary.add("谁知".toCharArray()); 884 | stopWordDictionary.add("顺".toCharArray()); 885 | stopWordDictionary.add("顺着".toCharArray()); 886 | stopWordDictionary.add("似的".toCharArray()); 887 | stopWordDictionary.add("虽".toCharArray()); 888 | stopWordDictionary.add("虽然".toCharArray()); 889 | stopWordDictionary.add("虽说".toCharArray()); 890 | stopWordDictionary.add("虽则".toCharArray()); 891 | stopWordDictionary.add("随".toCharArray()); 892 | stopWordDictionary.add("随着".toCharArray()); 893 | stopWordDictionary.add("所".toCharArray()); 894 | stopWordDictionary.add("所以".toCharArray()); 895 | stopWordDictionary.add("他".toCharArray()); 896 | stopWordDictionary.add("他们".toCharArray()); 897 | stopWordDictionary.add("他人".toCharArray()); 898 | stopWordDictionary.add("它".toCharArray()); 899 | stopWordDictionary.add("它们".toCharArray()); 900 | stopWordDictionary.add("她".toCharArray()); 901 | stopWordDictionary.add("她们".toCharArray()); 902 | stopWordDictionary.add("倘".toCharArray()); 903 | stopWordDictionary.add("倘或".toCharArray()); 904 | stopWordDictionary.add("倘然".toCharArray()); 905 | stopWordDictionary.add("倘若".toCharArray()); 906 | stopWordDictionary.add("倘使".toCharArray()); 907 | stopWordDictionary.add("腾".toCharArray()); 908 | stopWordDictionary.add("替".toCharArray()); 909 | stopWordDictionary.add("通过".toCharArray()); 910 | stopWordDictionary.add("同".toCharArray()); 911 | stopWordDictionary.add("同时".toCharArray()); 912 | stopWordDictionary.add("哇".toCharArray()); 913 | stopWordDictionary.add("万一".toCharArray()); 914 | stopWordDictionary.add("往".toCharArray()); 915 | stopWordDictionary.add("望".toCharArray()); 916 | stopWordDictionary.add("为".toCharArray()); 917 | stopWordDictionary.add("为何".toCharArray()); 918 | stopWordDictionary.add("为了".toCharArray()); 919 | stopWordDictionary.add("为什么".toCharArray()); 920 | stopWordDictionary.add("为着".toCharArray()); 921 | stopWordDictionary.add("喂".toCharArray()); 922 | stopWordDictionary.add("嗡嗡".toCharArray()); 923 | stopWordDictionary.add("我".toCharArray()); 924 | stopWordDictionary.add("我们".toCharArray()); 925 | stopWordDictionary.add("呜".toCharArray()); 926 | stopWordDictionary.add("呜呼".toCharArray()); 927 | stopWordDictionary.add("乌乎".toCharArray()); 928 | stopWordDictionary.add("无论".toCharArray()); 929 | stopWordDictionary.add("无宁".toCharArray()); 930 | stopWordDictionary.add("毋宁".toCharArray()); 931 | stopWordDictionary.add("嘻".toCharArray()); 932 | stopWordDictionary.add("吓".toCharArray()); 933 | stopWordDictionary.add("相对而言".toCharArray()); 934 | stopWordDictionary.add("像".toCharArray()); 935 | stopWordDictionary.add("向".toCharArray()); 936 | stopWordDictionary.add("向着".toCharArray()); 937 | stopWordDictionary.add("嘘".toCharArray()); 938 | stopWordDictionary.add("呀".toCharArray()); 939 | stopWordDictionary.add("焉".toCharArray()); 940 | stopWordDictionary.add("沿".toCharArray()); 941 | stopWordDictionary.add("沿着".toCharArray()); 942 | 
stopWordDictionary.add("要".toCharArray()); 943 | stopWordDictionary.add("要不".toCharArray()); 944 | stopWordDictionary.add("要不然".toCharArray()); 945 | stopWordDictionary.add("要不是".toCharArray()); 946 | stopWordDictionary.add("要么".toCharArray()); 947 | stopWordDictionary.add("要是".toCharArray()); 948 | stopWordDictionary.add("也".toCharArray()); 949 | stopWordDictionary.add("也罢".toCharArray()); 950 | stopWordDictionary.add("也好".toCharArray()); 951 | stopWordDictionary.add("一".toCharArray()); 952 | stopWordDictionary.add("一般".toCharArray()); 953 | stopWordDictionary.add("一旦".toCharArray()); 954 | stopWordDictionary.add("一方面".toCharArray()); 955 | stopWordDictionary.add("一来".toCharArray()); 956 | stopWordDictionary.add("一切".toCharArray()); 957 | stopWordDictionary.add("一样".toCharArray()); 958 | stopWordDictionary.add("一则".toCharArray()); 959 | stopWordDictionary.add("依".toCharArray()); 960 | stopWordDictionary.add("依照".toCharArray()); 961 | stopWordDictionary.add("矣".toCharArray()); 962 | stopWordDictionary.add("以".toCharArray()); 963 | stopWordDictionary.add("以便".toCharArray()); 964 | stopWordDictionary.add("以及".toCharArray()); 965 | stopWordDictionary.add("以免".toCharArray()); 966 | stopWordDictionary.add("以至".toCharArray()); 967 | stopWordDictionary.add("以至于".toCharArray()); 968 | stopWordDictionary.add("以致".toCharArray()); 969 | stopWordDictionary.add("抑或".toCharArray()); 970 | stopWordDictionary.add("因".toCharArray()); 971 | stopWordDictionary.add("因此".toCharArray()); 972 | stopWordDictionary.add("因而".toCharArray()); 973 | stopWordDictionary.add("因为".toCharArray()); 974 | stopWordDictionary.add("哟".toCharArray()); 975 | stopWordDictionary.add("用".toCharArray()); 976 | stopWordDictionary.add("由".toCharArray()); 977 | stopWordDictionary.add("由此可见".toCharArray()); 978 | stopWordDictionary.add("由于".toCharArray()); 979 | stopWordDictionary.add("有".toCharArray()); 980 | stopWordDictionary.add("有的".toCharArray()); 981 | stopWordDictionary.add("有关".toCharArray()); 982 | stopWordDictionary.add("有些".toCharArray()); 983 | stopWordDictionary.add("又".toCharArray()); 984 | stopWordDictionary.add("于".toCharArray()); 985 | stopWordDictionary.add("于是".toCharArray()); 986 | stopWordDictionary.add("于是乎".toCharArray()); 987 | stopWordDictionary.add("与".toCharArray()); 988 | stopWordDictionary.add("与此同时".toCharArray()); 989 | stopWordDictionary.add("与否".toCharArray()); 990 | stopWordDictionary.add("与其".toCharArray()); 991 | stopWordDictionary.add("越是".toCharArray()); 992 | stopWordDictionary.add("云云".toCharArray()); 993 | stopWordDictionary.add("哉".toCharArray()); 994 | stopWordDictionary.add("再说".toCharArray()); 995 | stopWordDictionary.add("再者".toCharArray()); 996 | stopWordDictionary.add("在".toCharArray()); 997 | stopWordDictionary.add("在下".toCharArray()); 998 | stopWordDictionary.add("咱".toCharArray()); 999 | stopWordDictionary.add("咱们".toCharArray()); 1000 | stopWordDictionary.add("则".toCharArray()); 1001 | stopWordDictionary.add("怎".toCharArray()); 1002 | stopWordDictionary.add("怎么".toCharArray()); 1003 | stopWordDictionary.add("怎么办".toCharArray()); 1004 | stopWordDictionary.add("怎么样".toCharArray()); 1005 | stopWordDictionary.add("怎样".toCharArray()); 1006 | stopWordDictionary.add("咋".toCharArray()); 1007 | stopWordDictionary.add("照".toCharArray()); 1008 | stopWordDictionary.add("照着".toCharArray()); 1009 | stopWordDictionary.add("者".toCharArray()); 1010 | stopWordDictionary.add("这".toCharArray()); 1011 | stopWordDictionary.add("这边".toCharArray()); 1012 | stopWordDictionary.add("这儿".toCharArray()); 1013 | 
stopWordDictionary.add("这个".toCharArray()); 1014 | stopWordDictionary.add("这会儿".toCharArray()); 1015 | stopWordDictionary.add("这就是说".toCharArray()); 1016 | stopWordDictionary.add("这里".toCharArray()); 1017 | stopWordDictionary.add("这么".toCharArray()); 1018 | stopWordDictionary.add("这么点儿".toCharArray()); 1019 | stopWordDictionary.add("这么些".toCharArray()); 1020 | stopWordDictionary.add("这么样".toCharArray()); 1021 | stopWordDictionary.add("这时".toCharArray()); 1022 | stopWordDictionary.add("这些".toCharArray()); 1023 | stopWordDictionary.add("这样".toCharArray()); 1024 | stopWordDictionary.add("正如".toCharArray()); 1025 | stopWordDictionary.add("吱".toCharArray()); 1026 | stopWordDictionary.add("之".toCharArray()); 1027 | stopWordDictionary.add("之类".toCharArray()); 1028 | stopWordDictionary.add("之所以".toCharArray()); 1029 | stopWordDictionary.add("之一".toCharArray()); 1030 | stopWordDictionary.add("只是".toCharArray()); 1031 | stopWordDictionary.add("只限".toCharArray()); 1032 | stopWordDictionary.add("只要".toCharArray()); 1033 | stopWordDictionary.add("只有".toCharArray()); 1034 | stopWordDictionary.add("至".toCharArray()); 1035 | stopWordDictionary.add("至于".toCharArray()); 1036 | stopWordDictionary.add("诸位".toCharArray()); 1037 | stopWordDictionary.add("着".toCharArray()); 1038 | stopWordDictionary.add("着呢".toCharArray()); 1039 | stopWordDictionary.add("自".toCharArray()); 1040 | stopWordDictionary.add("自从".toCharArray()); 1041 | stopWordDictionary.add("自个儿".toCharArray()); 1042 | stopWordDictionary.add("自各儿".toCharArray()); 1043 | stopWordDictionary.add("自己".toCharArray()); 1044 | stopWordDictionary.add("自家".toCharArray()); 1045 | stopWordDictionary.add("自身".toCharArray()); 1046 | stopWordDictionary.add("综上所述".toCharArray()); 1047 | stopWordDictionary.add("总的来看".toCharArray()); 1048 | stopWordDictionary.add("总的来说".toCharArray()); 1049 | stopWordDictionary.add("总的说来".toCharArray()); 1050 | stopWordDictionary.add("总而言之".toCharArray()); 1051 | stopWordDictionary.add("总之".toCharArray()); 1052 | stopWordDictionary.add("纵".toCharArray()); 1053 | stopWordDictionary.add("纵令".toCharArray()); 1054 | stopWordDictionary.add("纵然".toCharArray()); 1055 | stopWordDictionary.add("纵使".toCharArray()); 1056 | stopWordDictionary.add("遵照".toCharArray()); 1057 | stopWordDictionary.add("作为".toCharArray()); 1058 | stopWordDictionary.add("兮".toCharArray()); 1059 | stopWordDictionary.add("呃".toCharArray()); 1060 | stopWordDictionary.add("呗".toCharArray()); 1061 | stopWordDictionary.add("咚".toCharArray()); 1062 | stopWordDictionary.add("咦".toCharArray()); 1063 | stopWordDictionary.add("喏".toCharArray()); 1064 | stopWordDictionary.add("啐".toCharArray()); 1065 | stopWordDictionary.add("喔唷".toCharArray()); 1066 | stopWordDictionary.add("嗬".toCharArray()); 1067 | stopWordDictionary.add("嗯".toCharArray()); 1068 | stopWordDictionary.add("嗳".toCharArray()); 1069 | } 1070 | 1071 | 1072 | /** 1073 | * 返回useSmart标志位 1074 | * isSmartMode =true ,分词器使用智能切分策略, =false则使用细粒度切分 1075 | * 1076 | * @return isSmartMode 1077 | */ 1078 | public boolean isSmartMode() { 1079 | return smartMode; 1080 | } 1081 | 1082 | /** 1083 | * 设置useSmart标志位 1084 | * isSmartMode =true ,分词器使用智能切分策略, =false则使用细粒度切分 1085 | * 1086 | * @param smartMode 1087 | */ 1088 | public void setSmartMode(boolean smartMode) { 1089 | this.smartMode = smartMode; 1090 | } 1091 | 1092 | @Override 1093 | public List getMainDictionary() { 1094 | return mainDictionary; 1095 | } 1096 | 1097 | @Override 1098 | public List getStopWordDictionary() { 1099 | return stopWordDictionary; 1100 | } 1101 | 1102 | 
@Override 1103 | public List getQuantifierDictionary() { 1104 | return quantifierDictionary; 1105 | } 1106 | 1107 | 1108 | } 1109 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/.gitignore: -------------------------------------------------------------------------------- 1 | ./data/ 2 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'distribution' 2 | 3 | ext { 4 | LUCENCE_VERSION = '4.10.4' 5 | ELASTICSEARCH_VERSION = '1.6.0' 6 | } 7 | 8 | group = "io.github.zacker330.es" 9 | archivesBaseName = "ik-analysis-es-plugin" 10 | version = "1.0.1" 11 | 12 | dependencies { 13 | 14 | compile project(':ik-analysis-core') 15 | 16 | compile("org.elasticsearch:elasticsearch:$ELASTICSEARCH_VERSION") 17 | compile("org.apache.lucene:lucene-core:$LUCENCE_VERSION") 18 | compile("org.apache.lucene:lucene-queryparser:$LUCENCE_VERSION") 19 | compile("org.apache.lucene:lucene-analyzers-common:$LUCENCE_VERSION") 20 | runtime('ch.qos.logback:logback-classic:1.1.3') 21 | 22 | testCompile("org.apache.lucene:lucene-test-framework:$LUCENCE_VERSION") { 23 | exclude module: 'randomizedtesting-runner' 24 | } 25 | 26 | testCompile('junit:junit:4.12') 27 | testCompile('org.hamcrest:hamcrest-all:1.3') 28 | testCompile("com.carrotsearch.randomizedtesting:randomizedtesting-runner:2.1.16") 29 | 30 | testCompile group: 'org.elasticsearch', name: 'elasticsearch', version: ELASTICSEARCH_VERSION, classifier: 'tests' 31 | 32 | } 33 | 34 | 35 | modifyPom { 36 | project { 37 | name 'es-ik' 38 | description 'Kind of Chinese Analysis for Elasticsearch' 39 | url 'https://github.com/zacker330/es-ik' 40 | inceptionYear '2015' 41 | 42 | scm { 43 | url 'https://github.com/zacker330/es-ik' 44 | connection 'scm:https://github.com/zacker330/es-ik.git' 45 | developerConnection 'scm:git@github.com:zacker330/es-ik.git' 46 | } 47 | 48 | licenses { 49 | license { 50 | name 'The Apache Software License, Version 2.0' 51 | url 'http://www.apache.org/licenses/LICENSE-2.0.txt' 52 | distribution 'repo' 53 | } 54 | } 55 | 56 | developers { 57 | developer { 58 | id 'zacker330' 59 | name 'Jack' 60 | email 'zacker330@gmail.com' 61 | } 62 | } 63 | } 64 | } 65 | 66 | extraArchive { 67 | sources = true 68 | tests = true 69 | javadoc = true 70 | } 71 | 72 | 73 | 74 | distributions { 75 | main { 76 | baseName = 'ik-analysis-es-plugin' 77 | contents { 78 | from { "build/libs/" } 79 | from { "libs/" } 80 | from { project(":ik-analysis-core").buildDir.path + '/libs/' } 81 | from { project(":ik-analysis-es-plugin").buildDir.path + '/libs/' } 82 | } 83 | 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/IKAnalysisBinderProcessor.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.elasticsearch.index.analysis.AnalysisModule; 4 | 5 | public class IKAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { 6 | @Override 7 | public void processAnalyzers(AnalyzersBindings analyzersBindings) { 8 | analyzersBindings.processAnalyzer("ik_analysis", IKAnalyzerProvider.class); 9 | } 10 | 11 | @Override 12 | public void processTokenizers(TokenizersBindings tokenizersBindings) { 13 | 
tokenizersBindings.processTokenizer("ik_tokenizer", IKTokenizerFactory.class); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/IKAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.elasticsearch.common.inject.Inject; 4 | import org.elasticsearch.common.inject.assistedinject.Assisted; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.Index; 8 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 9 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 10 | import org.elasticsearch.index.settings.IndexSettings; 11 | import org.wltea.analyzer.lucene.IKAnalyzer; 12 | 13 | import java.util.Iterator; 14 | import java.util.ServiceLoader; 15 | 16 | public class IKAnalyzerProvider extends AbstractIndexAnalyzerProvider { 17 | private final IKAnalyzer analyzer; 18 | private ServiceLoader loader; 19 | 20 | @Inject 21 | public IKAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { 22 | super(index, indexSettings, name, settings); 23 | 24 | loader = ServiceLoader.load(Configuration.class); 25 | Iterator iterator = loader.iterator(); 26 | if (!iterator.hasNext()) { 27 | throw new NotFoundIKAnalyzerConfigurationImplementation(); 28 | } 29 | analyzer = new IKAnalyzer(iterator.next().init(index, indexSettings, env, name, settings)); 30 | } 31 | 32 | @Override 33 | public IKAnalyzer get() { 34 | return this.analyzer; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/IKTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.inject.assistedinject.Assisted; 6 | import org.elasticsearch.common.logging.ESLogger; 7 | import org.elasticsearch.common.logging.ESLoggerFactory; 8 | import org.elasticsearch.common.settings.Settings; 9 | import org.elasticsearch.env.Environment; 10 | import org.elasticsearch.index.Index; 11 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 12 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 13 | import org.elasticsearch.index.settings.IndexSettings; 14 | import org.wltea.analyzer.lucene.IKTokenizer; 15 | 16 | import java.io.Reader; 17 | import java.util.Iterator; 18 | import java.util.ServiceLoader; 19 | 20 | public class IKTokenizerFactory extends AbstractTokenizerFactory { 21 | private final ESLogger logger = ESLoggerFactory.getLogger(IKTokenizerFactory.class.getName()); 22 | 23 | private Configuration configuration; 24 | private ServiceLoader loader; 25 | 26 | 27 | @Inject 28 | public IKTokenizerFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { 29 | super(index, indexSettings, name, settings); 30 | loader = ServiceLoader.load(Configuration.class); 31 | Iterator iterator = loader.iterator(); 32 | if (!iterator.hasNext()) { 33 | logger.error("please provide the implementation of 
Configuration interface"); 34 | throw new NotFoundIKAnalyzerConfigurationImplementation(); 35 | } 36 | 37 | configuration = iterator.next(); 38 | configuration.init(index, indexSettings, env, name, settings); 39 | 40 | } 41 | 42 | @Override 43 | public Tokenizer create(Reader reader) { 44 | return new IKTokenizer(reader, configuration); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/NotFoundIKAnalyzerConfigurationImplementation.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | public class NotFoundIKAnalyzerConfigurationImplementation extends RuntimeException { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/index/analysis/ik/spi/Configuration.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik.spi; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.index.settings.IndexSettings; 7 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 8 | 9 | public interface Configuration extends DictionaryConfiguration { 10 | Configuration init(Index index, @IndexSettings Settings indexSettings, Environment env, String name, Settings settings); 11 | } 12 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/elasticsearch/plugin/analyzer/ik/AnalysisIKPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analyzer.ik; 2 | 3 | import org.elasticsearch.common.inject.Module; 4 | import org.elasticsearch.index.analysis.AnalysisModule; 5 | import org.elasticsearch.index.analysis.ik.IKAnalysisBinderProcessor; 6 | import org.elasticsearch.plugins.AbstractPlugin; 7 | 8 | public class AnalysisIKPlugin extends AbstractPlugin { 9 | @Override 10 | public String name() { 11 | return "ik_analysis"; 12 | } 13 | 14 | @Override 15 | public String description() { 16 | return "IK Chinese analysis support"; 17 | } 18 | 19 | @Override public void processModule(Module module) { 20 | if (module instanceof AnalysisModule) { 21 | AnalysisModule analysisModule = (AnalysisModule) module; 22 | analysisModule.addProcessor(new IKAnalysisBinderProcessor()); 23 | } 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.lucene; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 6 | 7 | import java.io.Reader; 8 | 9 | public final class IKAnalyzer extends Analyzer { 10 | 11 | private DictionaryConfiguration configuration; 12 | 13 | public IKAnalyzer(DictionaryConfiguration configuration) { 14 | super(); 15 | this.configuration = configuration; 16 | } 17 | @Override 18 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) { 19 | Tokenizer _IKTokenizer = new IKTokenizer(in, 
configuration); 20 | return new TokenStreamComponents(_IKTokenizer); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK Chinese word segmentation, version 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com) 21 | * Copyright 2012, Oolong Studio 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import org.apache.lucene.analysis.Tokenizer; 30 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 31 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 32 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 33 | import org.wltea.analyzer.configuration.DictionaryConfiguration; 34 | import org.wltea.analyzer.core.IKSegmenter; 35 | import org.wltea.analyzer.core.Lexeme; 36 | 37 | import java.io.IOException; 38 | import java.io.Reader; 39 | 40 | 41 | /** 42 | * Lucene Tokenizer adapter for the IK segmenter 43 | * Compatible with Lucene 4.0 44 | */ 45 | public final class IKTokenizer extends Tokenizer { 46 | 47 | // the underlying IK segmenter implementation 48 | private IKSegmenter _IKImplement; 49 | 50 | // lexeme text attribute 51 | private final CharTermAttribute termAtt; 52 | // lexeme offset attribute 53 | private final OffsetAttribute offsetAtt; 54 | // lexeme type attribute (see the type constants in org.wltea.analyzer.core.Lexeme) 55 | private final TypeAttribute typeAtt; 56 | // records the end position of the last lexeme 57 | private int endPosition; 58 | 59 | public IKTokenizer(Reader in, DictionaryConfiguration configuration) { 60 | super(in); 61 | offsetAtt = addAttribute(OffsetAttribute.class); 62 | termAtt = addAttribute(CharTermAttribute.class); 63 | typeAtt = addAttribute(TypeAttribute.class); 64 | _IKImplement = new IKSegmenter(input, configuration); 65 | } 66 | 67 | /* (non-Javadoc) 68 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 69 | */ 70 | @Override 71 | public boolean incrementToken() throws IOException { 72 | // clear all lexeme attributes 73 | clearAttributes(); 74 | Lexeme nextLexeme = _IKImplement.next(); 75 | if (nextLexeme != null) { 76 | // convert the Lexeme into token attributes 77 | // set the lexeme text 78 | termAtt.append(nextLexeme.getLexemeText()); 79 | // set the lexeme length 80 | termAtt.setLength(nextLexeme.getLength()); 81 | // set the lexeme offsets 82 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); 83 | // record the last position reached by segmentation 84 | endPosition = nextLexeme.getEndPosition(); 85 | // record the lexeme type 86 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 87 | // return true to signal that another lexeme is available 88 | return true; 89 | } 90 | // return false to signal that lexeme output is finished 91 | return false; 92 | } 93 | 94 | /* 95 | * 
(non-Javadoc) 96 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 97 | */ 98 | @Override 99 | public void reset() throws IOException { 100 | super.reset(); 101 | _IKImplement.reset(input); 102 | } 103 | 104 | @Override 105 | public final void end() throws IOException { 106 | super.end(); 107 | // set final offset 108 | int finalOffset = correctOffset(this.endPosition); 109 | offsetAtt.setOffset(finalOffset, finalOffset); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/main/resources/es-plugin.properties: -------------------------------------------------------------------------------- 1 | plugin=org.elasticsearch.plugin.analyzer.ik.AnalysisIKPlugin 2 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/test/java/IkESPluginTest.java: -------------------------------------------------------------------------------- 1 | import org.elasticsearch.Version; 2 | import org.elasticsearch.cluster.metadata.IndexMetaData; 3 | import org.elasticsearch.common.inject.Injector; 4 | import org.elasticsearch.common.inject.ModulesBuilder; 5 | import org.elasticsearch.common.settings.ImmutableSettings; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.common.settings.SettingsModule; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.env.EnvironmentModule; 10 | import org.elasticsearch.index.Index; 11 | import org.elasticsearch.index.IndexNameModule; 12 | import org.elasticsearch.index.analysis.AnalysisModule; 13 | import org.elasticsearch.index.analysis.AnalysisService; 14 | import org.elasticsearch.index.analysis.TokenizerFactory; 15 | import org.elasticsearch.index.analysis.ik.IKAnalysisBinderProcessor; 16 | import org.elasticsearch.index.analysis.ik.IKTokenizerFactory; 17 | import org.elasticsearch.index.settings.IndexSettingsModule; 18 | import org.elasticsearch.indices.analysis.IndicesAnalysisModule; 19 | import org.elasticsearch.indices.analysis.IndicesAnalysisService; 20 | import org.elasticsearch.test.ElasticsearchTestCase; 21 | import org.hamcrest.MatcherAssert; 22 | import org.junit.Test; 23 | 24 | import static org.hamcrest.Matchers.instanceOf; 25 | 26 | 27 | public class IkESPluginTest extends ElasticsearchTestCase { 28 | 29 | 30 | @Test 31 | public void testDefaultsIKAnalysis() { 32 | Index index = new Index("test"); 33 | 34 | Settings settings = ImmutableSettings.settingsBuilder() 35 | .put("path.home", "none") 36 | .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) 37 | .build(); 38 | 39 | Injector parentInjector = new ModulesBuilder().add(new SettingsModule(ImmutableSettings.EMPTY), new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector(); 40 | Injector injector = new ModulesBuilder().add( 41 | new IndexSettingsModule(index, settings), 42 | new IndexNameModule(index), 43 | new AnalysisModule(ImmutableSettings.EMPTY, parentInjector.getInstance(IndicesAnalysisService.class)).addProcessor(new IKAnalysisBinderProcessor())) 44 | .createChildInjector(parentInjector); 45 | 46 | AnalysisService analysisService = injector.getInstance(AnalysisService.class); 47 | 48 | TokenizerFactory tokenizerFactory = analysisService.tokenizer("ik_tokenizer"); 49 | MatcherAssert.assertThat(tokenizerFactory, instanceOf(IKTokenizerFactory.class)); 50 | 51 | 52 | } 53 | } 54 | --------------------------------------------------------------------------------
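Note: IKAnalysisBinderProcessor above registers the analyzer as "ik_analysis" and the tokenizer as "ik_tokenizer". The following standalone snippet is a minimal sketch, not a file in this repository, of how those names could be wired into index settings once the plugin and a Configuration implementation (for example es-ik-sqlite3) are installed; the analyzer key "my_ik", the class name, and the create-index call mentioned in the comment are illustrative assumptions.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class IkIndexSettingsSketch {
    public static void main(String[] args) {
        // Define a custom analyzer "my_ik" (hypothetical name) built on the
        // "ik_tokenizer" registered by IKAnalysisBinderProcessor.
        Settings settings = ImmutableSettings.settingsBuilder()
                .put("index.analysis.analyzer.my_ik.type", "custom")
                .put("index.analysis.analyzer.my_ik.tokenizer", "ik_tokenizer")
                .build();
        // On an Elasticsearch 1.6 client these settings would be handed to a
        // create-index request, e.g. prepareCreate("my_index").setSettings(settings).
        System.out.println(settings.getAsMap());
    }
}

The same settings could equally be expressed as JSON in a create-index request body; the Java form mirrors the ImmutableSettings style used by IkESPluginTest above.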
/ik-analysis-es-plugin/src/test/java/org/elasticsearch/index/analysis/ik/MockConfiguration.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis.ik; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.index.analysis.ik.spi.Configuration; 7 | import org.elasticsearch.index.settings.IndexSettings; 8 | 9 | import java.util.Collections; 10 | import java.util.List; 11 | 12 | public class MockConfiguration implements Configuration { 13 | 14 | 15 | @Override 16 | public Configuration init(Index index, @IndexSettings Settings indexSettings, Environment env, String name, Settings settings) { 17 | 18 | return this; 19 | } 20 | 21 | @Override 22 | public boolean isSmartMode() { 23 | return false; 24 | } 25 | 26 | @Override 27 | public void setSmartMode(boolean useSmart) { 28 | 29 | } 30 | 31 | @Override 32 | public List getMainDictionary() { 33 | return Collections.emptyList(); 34 | } 35 | 36 | @Override 37 | public List getStopWordDictionary() { 38 | return Collections.emptyList(); 39 | } 40 | 41 | @Override 42 | public List getQuantifierDictionary() { 43 | return Collections.emptyList(); 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /ik-analysis-es-plugin/src/test/resources/META-INF/services/org.elasticsearch.index.analysis.ik.spi.Configuration: -------------------------------------------------------------------------------- 1 | org.elasticsearch.index.analysis.ik.MockConfiguration 2 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include 'ik-analysis-core' 2 | 3 | include 'ik-analysis-es-plugin' 4 | 5 | include 'es-ik-sqlite3' 6 | 7 | --------------------------------------------------------------------------------
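Closing note: a minimal sketch, not part of the repository, of exercising IKAnalyzer directly through the Lucene 4.10 TokenStream API outside Elasticsearch. It borrows the test-scoped MockConfiguration above, so all dictionaries are empty and the output is only illustrative; a real caller would pass a populated DictionaryConfiguration such as the one provided by es-ik-sqlite3. The class name and sample sentence are assumptions.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.elasticsearch.index.analysis.ik.MockConfiguration;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.io.IOException;

public class IKAnalyzerUsageSketch {
    public static void main(String[] args) throws IOException {
        // Empty-dictionary configuration borrowed from the tests; swap in a real
        // DictionaryConfiguration implementation for meaningful segmentation.
        IKAnalyzer analyzer = new IKAnalyzer(new MockConfiguration());
        TokenStream stream = analyzer.tokenStream("content", "中华人民共和国国歌");
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // each iteration corresponds to one Lexeme emitted by IKSegmenter
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}

The loop mirrors what IKTokenizer does for Elasticsearch: each incrementToken() call surfaces one Lexeme from IKSegmenter as Lucene token attributes.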