├── .github
├── FUNDING.yml
└── workflows
│ └── release.yml
├── .gitignore
├── src
└── main
│ ├── resources
│ ├── plugin-security.policy
│ └── plugin-descriptor.properties
│ ├── java
│ └── org
│ │ ├── wltea
│ │ └── analyzer
│ │ │ ├── help
│ │ │ ├── ESPluginLoggerFactory.java
│ │ │ ├── Sleep.java
│ │ │ ├── CharacterHelper.java
│ │ │ └── PrefixPluginLogger.java
│ │ │ ├── core
│ │ │ ├── ISegmenter.java
│ │ │ ├── CharacterUtil.java
│ │ │ ├── CJKSegmenter.java
│ │ │ ├── IKSegmenter.java
│ │ │ ├── IKArbitrator.java
│ │ │ ├── QuickSortSet.java
│ │ │ ├── CN_QuantifierSegmenter.java
│ │ │ ├── LexemePath.java
│ │ │ ├── Lexeme.java
│ │ │ ├── LetterSegmenter.java
│ │ │ └── AnalyzeContext.java
│ │ │ ├── cfg
│ │ │ └── Configuration.java
│ │ │ ├── lucene
│ │ │ ├── IKAnalyzer.java
│ │ │ └── IKTokenizer.java
│ │ │ └── dic
│ │ │ ├── Hit.java
│ │ │ ├── Monitor.java
│ │ │ ├── DictSegment.java
│ │ │ └── Dictionary.java
│ │ └── elasticsearch
│ │ ├── index
│ │ └── analysis
│ │ │ ├── IkAnalyzerProvider.java
│ │ │ └── IkTokenizerFactory.java
│ │ └── plugin
│ │ └── analysis
│ │ └── ik
│ │ └── AnalysisIkPlugin.java
│ └── assemblies
│ └── plugin.xml
├── config
├── preposition.dic
├── extra_stopword.dic
├── suffix.dic
├── stopword.dic
├── IKAnalyzer.cfg.xml
├── surname.dic
├── quantifier.dic
└── extra_single_word_low_freq.dic
├── .travis.yml
├── release.sh
├── README.md
├── licenses
├── lucene-NOTICE.txt
└── lucene-LICENSE.txt
├── LICENSE.txt
└── pom.xml
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | patreon: medcl
2 | custom: ["https://www.buymeacoffee.com/medcl"]
3 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /data
2 | /work
3 | /logs
4 | /.idea
5 | /target
6 | /out
7 | .DS_Store
8 | *.iml
9 | !.travis.yml
10 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 | // needed because of the hot reload functionality
3 | permission java.net.SocketPermission "*", "connect,resolve";
4 | };
--------------------------------------------------------------------------------
/config/preposition.dic:
--------------------------------------------------------------------------------
1 | 不
2 | 也
3 | 了
4 | 仍
5 | 从
6 | 以
7 | 使
8 | 则
9 | 却
10 | 又
11 | 及
12 | 对
13 | 就
14 | 并
15 | 很
16 | 或
17 | 把
18 | 是
19 | 的
20 | 着
21 | 给
22 | 而
23 | 被
24 | 让
25 | 但
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | jdk:
3 | - oraclejdk8
4 | install: true
 5 | language: java
 6 | script:
 7 |   - sudo apt-get update && sudo apt-get install oracle-java8-installer
 8 |   - java -version
 9 |   - mvn clean package
10 |
--------------------------------------------------------------------------------
/config/extra_stopword.dic:
--------------------------------------------------------------------------------
1 | 也
2 | 了
3 | 仍
4 | 从
5 | 以
6 | 使
7 | 则
8 | 却
9 | 又
10 | 及
11 | 对
12 | 就
13 | 并
14 | 很
15 | 或
16 | 把
17 | 是
18 | 的
19 | 着
20 | 给
21 | 而
22 | 被
23 | 让
24 | 在
25 | 还
26 | 比
27 | 等
28 | 当
29 | 与
30 | 于
31 | 但
--------------------------------------------------------------------------------
/config/suffix.dic:
--------------------------------------------------------------------------------
1 | 乡
2 | 井
3 | 亭
4 | 党
5 | 区
6 | 厅
7 | 县
8 | 园
9 | 塔
10 | 家
11 | 寺
12 | 局
13 | 巷
14 | 市
15 | 弄
16 | 所
17 | 斯基
18 | 楼
19 | 江
20 | 河
21 | 海
22 | 湖
23 | 省
24 | 维奇
25 | 署
26 | 苑
27 | 街
28 | 觀
29 | 观
30 | 诺夫
31 | 路
32 | 部
33 | 镇
34 | 阁
35 | 山
36 | 子
37 | 娃
--------------------------------------------------------------------------------
/config/stopword.dic:
--------------------------------------------------------------------------------
1 | a
2 | an
3 | and
4 | are
5 | as
6 | at
7 | be
8 | but
9 | by
10 | for
11 | if
12 | in
13 | into
14 | is
15 | it
16 | no
17 | not
18 | of
19 | on
20 | or
21 | such
22 | that
23 | the
24 | their
25 | then
26 | there
27 | these
28 | they
29 | this
30 | to
31 | was
32 | will
33 | with
--------------------------------------------------------------------------------
/config/IKAnalyzer.cfg.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | IK Analyzer 扩展配置
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/release.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Tag-and-push release helper: writes the entered version into pom.xml,
# commits, tags v$VERSION and pushes branch + tags.
set -e

# -r prevents backslash mangling of the typed version string
read -r -p "Enter the version number: " VERSION

# only proceed when a non-empty version was entered
if [ -n "$VERSION" ]; then
    # replace the <elasticsearch.version>...</elasticsearch.version> value in pom.xml
    # (keep the opening tag — replacing the whole line would corrupt the pom)
    sed -i "s/<elasticsearch\.version>.*<\/elasticsearch\.version>/<elasticsearch.version>$VERSION<\/elasticsearch.version>/" pom.xml
    git add pom.xml
    git commit -m "Support version v$VERSION"
    git tag --annotate --message "v$VERSION" "v$VERSION"
    git push
    git push --tags
fi

# reset the version number
# git reset --hard HEAD~1
# git tag -d "v$VERSION"
# git push -f
# git push origin ":refs/tags/v$VERSION"
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release
2 |
3 | on:
4 | push:
5 | tags:
6 | - 'v*'
7 |
8 | jobs:
9 | build:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v3
13 | - name: Set up JDK 18
14 | uses: actions/setup-java@v3
15 | with:
16 | java-version: '18'
17 | distribution: 'temurin'
18 | cache: maven
19 | - name: Build with Maven
20 | run: mvn -B package --file pom.xml
21 |
22 | - name: Release
23 | uses: softprops/action-gh-release@v1
24 | with:
25 | files: target/releases/*.zip
26 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/ESPluginLoggerFactory.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.help;
2 |
3 | import org.apache.logging.log4j.LogManager;
4 | import org.apache.logging.log4j.Logger;
5 | import org.apache.logging.log4j.spi.ExtendedLogger;
6 |
7 | public class ESPluginLoggerFactory {
8 |
9 | private ESPluginLoggerFactory() {
10 | }
11 |
12 | static public Logger getLogger(String name) {
13 | return getLogger("", LogManager.getLogger(name));
14 | }
15 |
16 | static public Logger getLogger(String prefix, String name) {
17 | return getLogger(prefix, LogManager.getLogger(name));
18 | }
19 |
20 | static public Logger getLogger(String prefix, Class> clazz) {
21 | return getLogger(prefix, LogManager.getLogger(clazz.getName()));
22 | }
23 |
24 | static public Logger getLogger(String prefix, Logger logger) {
25 | return (Logger)(prefix != null && prefix.length() != 0 ? new PrefixPluginLogger((ExtendedLogger)logger, logger.getName(), prefix) : logger);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/config/surname.dic:
--------------------------------------------------------------------------------
1 | 丁
2 | 万
3 | 万俟
4 | 上官
5 | 东方
6 | 乔
7 | 于
8 | 令狐
9 | 仲孙
10 | 任
11 | 何
12 | 余
13 | 候
14 | 傅
15 | 公冶
16 | 公孙
17 | 公羊
18 | 冯
19 | 刘
20 | 单
21 | 单于
22 | 卢
23 | 史
24 | 叶
25 | 司徒
26 | 司空
27 | 司马
28 | 吕
29 | 吴
30 | 周
31 | 唐
32 | 夏
33 | 夏侯
34 | 太叔
35 | 姚
36 | 姜
37 | 孔
38 | 孙
39 | 孟
40 | 宇文
41 | 宋
42 | 宗政
43 | 尉迟
44 | 尹
45 | 崔
46 | 常
47 | 康
48 | 廖
49 | 张
50 | 彭
51 | 徐
52 | 慕容
53 | 戴
54 | 文
55 | 方
56 | 易
57 | 曹
58 | 曾
59 | 朱
60 | 李
61 | 杜
62 | 杨
63 | 林
64 | 梁
65 | 欧阳
66 | 武
67 | 段
68 | 毛
69 | 江
70 | 汤
71 | 沈
72 | 淳于
73 | 潘
74 | 澹台
75 | 濮阳
76 | 熊
77 | 王
78 | 田
79 | 申屠
80 | 白
81 | 皇甫
82 | 石
83 | 秦
84 | 程
85 | 罗
86 | 肖
87 | 胡
88 | 苏
89 | 范
90 | 董
91 | 蒋
92 | 薛
93 | 袁
94 | 许
95 | 诸葛
96 | 谢
97 | 谭
98 | 贺
99 | 贾
100 | 赖
101 | 赫连
102 | 赵
103 | 轩辕
104 | 邓
105 | 邱
106 | 邵
107 | 邹
108 | 郑
109 | 郝
110 | 郭
111 | 金
112 | 钟
113 | 钟离
114 | 钱
115 | 长孙
116 | 闻人
117 | 闾丘
118 | 阎
119 | 陆
120 | 陈
121 | 雷
122 | 韩
123 | 顾
124 | 马
125 | 高
126 | 魏
127 | 鲜于
128 | 黄
129 | 黎
130 | 龙
131 | 龚
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/Sleep.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.help;
2 |
3 | import org.apache.logging.log4j.Logger;
4 |
5 | public class Sleep {
6 |
7 | private static final Logger logger = ESPluginLoggerFactory.getLogger(Sleep.class.getName());
8 |
9 | public enum Type {MSEC, SEC, MIN, HOUR}
10 |
11 | ;
12 |
13 | public static void sleep(Type type, int num) {
14 | try {
15 | switch (type) {
16 | case MSEC:
17 | Thread.sleep(num);
18 | return;
19 | case SEC:
20 | Thread.sleep(num * 1000);
21 | return;
22 | case MIN:
23 | Thread.sleep(num * 60 * 1000);
24 | return;
25 | case HOUR:
26 | Thread.sleep(num * 60 * 60 * 1000);
27 | return;
28 | default:
29 | System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
30 | return;
31 | }
32 | } catch (InterruptedException e) {
33 | logger.error(e.getMessage(), e);
34 | }
35 | }
36 |
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.elasticsearch.common.settings.Settings;
4 | import org.elasticsearch.env.Environment;
5 | import org.elasticsearch.index.IndexSettings;
6 | import org.wltea.analyzer.cfg.Configuration;
7 | import org.wltea.analyzer.lucene.IKAnalyzer;
8 |
9 | public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider {
10 | private final IKAnalyzer analyzer;
11 |
12 | public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
13 | super(name, settings);
14 |
15 | Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);
16 |
17 | analyzer=new IKAnalyzer(configuration);
18 | }
19 |
20 | public static IkAnalyzerProvider getIkSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
21 | return new IkAnalyzerProvider(indexSettings,env,name,settings,true);
22 | }
23 |
24 | public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
25 | return new IkAnalyzerProvider(indexSettings,env,name,settings,false);
26 | }
27 |
28 | @Override public IKAnalyzer get() {
29 | return this.analyzer;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/ISegmenter.java:
--------------------------------------------------------------------------------
/**
 * IK 中文分词 版本 5.0
 * IK Analyzer release 5.0
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
 * Copyright 2012, Oolong Studio
 * provided by Linliangyi and copyright 2012 by Oolong studio
 *
 */
package org.wltea.analyzer.core;


/**
 * Sub-segmenter interface. Each implementation recognizes one family of
 * tokens (e.g. CJK words, letters, quantifiers) and emits lexeme candidates
 * into the shared analysis context.
 */
interface ISegmenter {

	/**
	 * Reads input from the shared context and emits the next candidate
	 * lexemes this segmenter recognizes.
	 * @param context the segmentation context shared by all sub-segmenters
	 */
	void analyze(AnalyzeContext context);


	/**
	 * Resets this sub-segmenter's internal state.
	 */
	void reset();

}
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.wltea.analyzer.cfg.Configuration;
8 | import org.wltea.analyzer.lucene.IKTokenizer;
9 |
10 | public class IkTokenizerFactory extends AbstractTokenizerFactory {
11 | private Configuration configuration;
12 |
13 | public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
14 | super(indexSettings, settings,name);
15 | configuration=new Configuration(env,settings);
16 | }
17 |
18 | public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
19 | return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false);
20 | }
21 |
22 | public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
23 | return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(true);
24 | }
25 |
26 | public IkTokenizerFactory setSmart(boolean smart){
27 | this.configuration.setUseSmart(smart);
28 | return this;
29 | }
30 |
31 | @Override
32 | public Tokenizer create() {
33 | return new IKTokenizer(configuration); }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java:
--------------------------------------------------------------------------------
package org.elasticsearch.plugin.analysis.ik;

import org.apache.lucene.analysis.Analyzer;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.IkAnalyzerProvider;
import org.elasticsearch.index.analysis.IkTokenizerFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.plugins.AnalysisPlugin;
import org.elasticsearch.plugins.Plugin;

import java.util.HashMap;
import java.util.Map;


/**
 * Elasticsearch plugin entry point. Registers the IK analysis components
 * under two names: "ik_smart" (coarse-grained segmentation) and
 * "ik_max_word" (fine-grained segmentation), both as tokenizers and as
 * ready-made analyzers.
 */
public class AnalysisIkPlugin extends Plugin implements AnalysisPlugin {

    public static String PLUGIN_NAME = "analysis-ik";

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();

        // smart -> coarse-grained, max_word -> fine-grained segmentation
        extra.put("ik_smart", IkTokenizerFactory::getIkSmartTokenizerFactory);
        extra.put("ik_max_word", IkTokenizerFactory::getIkTokenizerFactory);

        return extra;
    }

    @Override
    public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
        Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();

        extra.put("ik_smart", IkAnalyzerProvider::getIkSmartAnalyzerProvider);
        extra.put("ik_max_word", IkAnalyzerProvider::getIkAnalyzerProvider);

        return extra;
    }

}
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | analysis-ik-release
4 |
5 | zip
6 |
7 | false
8 |
9 |
10 | ${project.basedir}/config
11 | config
12 |
13 |
14 |
15 |
16 |
17 | ${project.basedir}/src/main/resources/plugin-descriptor.properties
18 |
19 | true
20 |
21 |
22 | ${project.basedir}/src/main/resources/plugin-security.policy
23 |
24 | true
25 |
26 |
27 |
28 |
29 |
30 | true
31 | true
32 |
33 | org.elasticsearch:elasticsearch
34 |
35 |
36 |
37 |
38 | true
39 | true
40 |
41 | org.apache.httpcomponents:httpclient
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/CharacterHelper.java:
--------------------------------------------------------------------------------
1 |
2 | package org.wltea.analyzer.help;
3 |
4 | public class CharacterHelper {
5 |
6 | public static boolean isSpaceLetter(char input){
7 | return input == 8 || input == 9
8 | || input == 10 || input == 13
9 | || input == 32 || input == 160;
10 | }
11 |
12 | public static boolean isEnglishLetter(char input){
13 | return (input >= 'a' && input <= 'z')
14 | || (input >= 'A' && input <= 'Z');
15 | }
16 |
17 | public static boolean isArabicNumber(char input){
18 | return input >= '0' && input <= '9';
19 | }
20 |
21 | public static boolean isCJKCharacter(char input){
22 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
23 | if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
24 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
25 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
26 |
27 | || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
28 |
29 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
30 | || ub == Character.UnicodeBlock.HANGUL_JAMO
31 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
32 |
33 | || ub == Character.UnicodeBlock.HIRAGANA
34 | || ub == Character.UnicodeBlock.KATAKANA
35 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS
36 | ) {
37 | return true;
38 | }else{
39 | return false;
40 | }
41 |
42 |
43 |
44 | }
45 |
46 | public static char regularize(char input){
47 | if (input == 12288) {
48 | input = (char) 32;
49 |
50 | }else if (input > 65280 && input < 65375) {
51 | input = (char) (input - 65248);
52 |
53 | }else if (input >= 'A' && input <= 'Z') {
54 | input += 32;
55 | }
56 |
57 | return input;
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/PrefixPluginLogger.java:
--------------------------------------------------------------------------------
package org.wltea.analyzer.help;

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.Marker;
import org.apache.logging.log4j.MarkerManager;
import org.apache.logging.log4j.message.Message;
import org.apache.logging.log4j.message.MessageFactory;
import org.apache.logging.log4j.spi.ExtendedLogger;
import org.apache.logging.log4j.spi.ExtendedLoggerWrapper;

import java.util.WeakHashMap;

/**
 * Logger wrapper that stamps every message with a fixed prefix, carried as
 * a log4j {@link Marker}. Markers are cached per prefix in a weak map so a
 * prefix's marker can be reclaimed once no logger references it any more.
 */
public class PrefixPluginLogger extends ExtendedLoggerWrapper {

    // prefix -> MarkerManager.Log4jMarker cache; weak keys let unused
    // prefixes be garbage-collected
    private static final WeakHashMap markers = new WeakHashMap();

    private final Marker marker;

    // exposed for tests/diagnostics
    static int markersSize() {
        return markers.size();
    }

    /** Returns the prefix this logger stamps on each message. */
    public String prefix() {
        return this.marker.getName();
    }

    PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) {
        super(logger, name, (MessageFactory) null);
        String actualPrefix = prefix == null ? "" : prefix;
        MarkerManager.Log4jMarker actualMarker;
        // WeakHashMap is not thread-safe; guard the lookup-or-insert
        synchronized (markers) {
            MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker) markers.get(actualPrefix);
            if (maybeMarker == null) {
                actualMarker = new MarkerManager.Log4jMarker(actualPrefix);
                // new String(..) deliberately creates a fresh, non-interned key
                // object so the weak reference stays collectible — do not "simplify"
                markers.put(new String(actualPrefix), actualMarker);
            } else {
                actualMarker = maybeMarker;
            }
        }

        this.marker = (Marker) actualMarker;
    }

    /**
     * Delegates to the wrapped logger, substituting this logger's prefix
     * marker. Callers are expected not to supply their own marker.
     */
    public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) {
        assert marker == null;

        super.logMessage(fqcn, level, this.marker, message, t);
    }
}
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/cfg/Configuration.java:
--------------------------------------------------------------------------------
package org.wltea.analyzer.cfg;

import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.core.PathUtils;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.dic.Dictionary;

import java.io.File;
import java.nio.file.Path;

/**
 * Runtime configuration for the IK analyzer, populated from index settings.
 */
public class Configuration {

    private Environment environment;
    private Settings settings;

    // whether smart (coarse-grained) segmentation is enabled
    private boolean useSmart;

    // whether loading of remote dictionaries is enabled
    private boolean enableRemoteDict = false;

    // whether lowercase folding is applied to input
    private boolean enableLowercase = true;


    @Inject
    public Configuration(Environment env, Settings settings) {
        this.environment = env;
        this.settings = settings;

        this.useSmart = "true".equals(settings.get("use_smart", "false"));
        this.enableLowercase = "true".equals(settings.get("enable_lowercase", "true"));
        this.enableRemoteDict = "true".equals(settings.get("enable_remote_dict", "true"));

        // ensure the dictionary singleton is loaded before any analysis runs
        Dictionary.initial(this);
    }

    /** Resolves the "config" directory that sits next to the plugin jar. */
    public Path getConfigInPluginDir() {
        File pluginLocation = new File(
                AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath());
        return PathUtils.get(pluginLocation.getParent(), "config").toAbsolutePath();
    }

    public boolean isUseSmart() {
        return this.useSmart;
    }

    public Configuration setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
        return this;
    }

    public Environment getEnvironment() {
        return this.environment;
    }

    public Settings getSettings() {
        return this.settings;
    }

    public boolean isEnableRemoteDict() {
        return this.enableRemoteDict;
    }

    public boolean isEnableLowercase() {
        return this.enableLowercase;
    }
}
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | # _site/ <-- the contents that will be served
11 | # plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | # .jar <-- classes, resources, dependencies
21 | # .jar <-- any number of jars
22 | # plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=2.0
29 | # java.version=1.7
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=${project.version}
38 | #
39 | # 'name': the plugin name
40 | name=${elasticsearch.plugin.name}
41 | #
42 | # 'classname': the name of the class to load, fully-qualified.
43 | classname=${elasticsearch.plugin.classname}
44 | #
45 | # 'java.version' version of java the code is built against
46 | # use the system property java.specification.version
47 | # version string must be a sequence of nonnegative decimal integers
48 | # separated by "."'s and may have leading zeros
49 | java.version=${maven.compiler.target}
50 | #
51 | # 'elasticsearch.version' version of elasticsearch compiled against
52 | # You will have to release a new version of the plugin for each new
53 | # elasticsearch release. This version is checked when the plugin
54 | # is loaded so Elasticsearch will refuse to start in the presence of
55 | # plugins with the incorrect elasticsearch.version.
56 | elasticsearch.version=${elasticsearch.version}
57 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.lucene;
26 |
27 | import org.apache.lucene.analysis.Analyzer;
28 | import org.apache.lucene.analysis.Tokenizer;
29 | import org.wltea.analyzer.cfg.Configuration;
30 |
31 | /**
32 | * IK分词器,Lucene Analyzer接口实现
33 | * 兼容Lucene 4.0版本
34 | */
35 | public final class IKAnalyzer extends Analyzer{
36 |
37 | private Configuration configuration;
38 |
39 | /**
40 | * IK分词器Lucene Analyzer接口实现类
41 | *
42 | * 默认细粒度切分算法
43 | */
44 | public IKAnalyzer(){
45 | }
46 |
47 | /**
48 | * IK分词器Lucene Analyzer接口实现类
49 | *
50 | * @param configuration IK配置
51 | */
52 | public IKAnalyzer(Configuration configuration){
53 | super();
54 | this.configuration = configuration;
55 | }
56 |
57 |
58 | /**
59 | * 重载Analyzer接口,构造分词组件
60 | */
61 | @Override
62 | protected TokenStreamComponents createComponents(String fieldName) {
63 | Tokenizer _IKTokenizer = new IKTokenizer(configuration);
64 | return new TokenStreamComponents(_IKTokenizer);
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/config/quantifier.dic:
--------------------------------------------------------------------------------
1 | 丈
2 | 下
3 | 世
4 | 世纪
5 | 两
6 | 个
7 | 中
8 | 串
9 | 亩
10 | 人
11 | 介
12 | 付
13 | 代
14 | 件
15 | 任
16 | 份
17 | 伏
18 | 伙
19 | 位
20 | 位数
21 | 例
22 | 倍
23 | 像素
24 | 元
25 | 克
26 | 克拉
27 | 公亩
28 | 公克
29 | 公分
30 | 公升
31 | 公尺
32 | 公担
33 | 公斤
34 | 公里
35 | 公顷
36 | 具
37 | 册
38 | 出
39 | 刀
40 | 分
41 | 分钟
42 | 分米
43 | 划
44 | 列
45 | 则
46 | 刻
47 | 剂
48 | 剑
49 | 副
50 | 加仑
51 | 勺
52 | 包
53 | 匙
54 | 匹
55 | 区
56 | 千克
57 | 千米
58 | 升
59 | 卷
60 | 厅
61 | 厘
62 | 厘米
63 | 双
64 | 发
65 | 口
66 | 句
67 | 只
68 | 台
69 | 叶
70 | 号
71 | 名
72 | 吨
73 | 听
74 | 员
75 | 周
76 | 周年
77 | 品
78 | 回
79 | 团
80 | 圆
81 | 圈
82 | 地
83 | 场
84 | 块
85 | 坪
86 | 堆
87 | 声
88 | 壶
89 | 处
90 | 夜
91 | 大
92 | 天
93 | 头
94 | 套
95 | 女
96 | 孔
97 | 字
98 | 宗
99 | 室
100 | 家
101 | 寸
102 | 对
103 | 封
104 | 尊
105 | 小时
106 | 尺
107 | 尾
108 | 局
109 | 层
110 | 届
111 | 岁
112 | 师
113 | 帧
114 | 幅
115 | 幕
116 | 幢
117 | 平方
118 | 平方公尺
119 | 平方公里
120 | 平方分米
121 | 平方厘米
122 | 平方码
123 | 平方米
124 | 平方英寸
125 | 平方英尺
126 | 平方英里
127 | 平米
128 | 年
129 | 年代
130 | 年级
131 | 度
132 | 座
133 | 式
134 | 引
135 | 张
136 | 成
137 | 战
138 | 截
139 | 户
140 | 房
141 | 所
142 | 扇
143 | 手
144 | 打
145 | 批
146 | 把
147 | 折
148 | 担
149 | 拍
150 | 招
151 | 拨
152 | 拳
153 | 指
154 | 掌
155 | 排
156 | 撮
157 | 支
158 | 文
159 | 斗
160 | 斤
161 | 方
162 | 族
163 | 日
164 | 时
165 | 曲
166 | 月
167 | 月份
168 | 期
169 | 本
170 | 朵
171 | 村
172 | 束
173 | 条
174 | 来
175 | 杯
176 | 枚
177 | 枝
178 | 枪
179 | 架
180 | 柄
181 | 柜
182 | 栋
183 | 栏
184 | 株
185 | 样
186 | 根
187 | 格
188 | 案
189 | 桌
190 | 档
191 | 桩
192 | 桶
193 | 梯
194 | 棵
195 | 楼
196 | 次
197 | 款
198 | 步
199 | 段
200 | 毛
201 | 毫
202 | 毫升
203 | 毫米
204 | 毫克
205 | 池
206 | 洲
207 | 派
208 | 海里
209 | 滴
210 | 炮
211 | 点
212 | 点钟
213 | 片
214 | 版
215 | 环
216 | 班
217 | 瓣
218 | 瓶
219 | 生
220 | 男
221 | 画
222 | 界
223 | 盆
224 | 盎司
225 | 盏
226 | 盒
227 | 盘
228 | 相
229 | 眼
230 | 石
231 | 码
232 | 碗
233 | 碟
234 | 磅
235 | 种
236 | 科
237 | 秒
238 | 秒钟
239 | 窝
240 | 立方公尺
241 | 立方分米
242 | 立方厘米
243 | 立方码
244 | 立方米
245 | 立方英寸
246 | 立方英尺
247 | 站
248 | 章
249 | 笔
250 | 等
251 | 筐
252 | 筒
253 | 箱
254 | 篇
255 | 篓
256 | 篮
257 | 簇
258 | 米
259 | 类
260 | 粒
261 | 级
262 | 组
263 | 维
264 | 缕
265 | 缸
266 | 罐
267 | 网
268 | 群
269 | 股
270 | 脚
271 | 船
272 | 艇
273 | 艘
274 | 色
275 | 节
276 | 英亩
277 | 英寸
278 | 英尺
279 | 英里
280 | 行
281 | 袋
282 | 角
283 | 言
284 | 课
285 | 起
286 | 趟
287 | 路
288 | 车
289 | 转
290 | 轮
291 | 辆
292 | 辈
293 | 连
294 | 通
295 | 遍
296 | 部
297 | 里
298 | 重
299 | 针
300 | 钟
301 | 钱
302 | 锅
303 | 门
304 | 间
305 | 队
306 | 阶段
307 | 隅
308 | 集
309 | 页
310 | 顶
311 | 顷
312 | 项
313 | 顿
314 | 颗
315 | 餐
316 | 首
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Hit.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | /**
29 | * 表示一次词典匹配的命中
30 | */
31 | public class Hit {
32 | //Hit不匹配
33 | private static final int UNMATCH = 0x00000000;
34 | //Hit完全匹配
35 | private static final int MATCH = 0x00000001;
36 | //Hit前缀匹配
37 | private static final int PREFIX = 0x00000010;
38 |
39 |
40 | //该HIT当前状态,默认未匹配
41 | private int hitState = UNMATCH;
42 |
43 | //记录词典匹配过程中,当前匹配到的词典分支节点
44 | private DictSegment matchedDictSegment;
45 | /*
46 | * 词段开始位置
47 | */
48 | private int begin;
49 | /*
50 | * 词段的结束位置
51 | */
52 | private int end;
53 |
54 |
55 | /**
56 | * 判断是否完全匹配
57 | */
58 | public boolean isMatch() {
59 | return (this.hitState & MATCH) > 0;
60 | }
61 | /**
62 | *
63 | */
64 | public void setMatch() {
65 | this.hitState = this.hitState | MATCH;
66 | }
67 |
68 | /**
69 | * 判断是否是词的前缀
70 | */
71 | public boolean isPrefix() {
72 | return (this.hitState & PREFIX) > 0;
73 | }
74 | /**
75 | *
76 | */
77 | public void setPrefix() {
78 | this.hitState = this.hitState | PREFIX;
79 | }
80 | /**
81 | * 判断是否是不匹配
82 | */
83 | public boolean isUnmatch() {
84 | return this.hitState == UNMATCH ;
85 | }
86 | /**
87 | *
88 | */
89 | public void setUnmatch() {
90 | this.hitState = UNMATCH;
91 | }
92 |
93 | public DictSegment getMatchedDictSegment() {
94 | return matchedDictSegment;
95 | }
96 |
97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) {
98 | this.matchedDictSegment = matchedDictSegment;
99 | }
100 |
101 | public int getBegin() {
102 | return begin;
103 | }
104 |
105 | public void setBegin(int begin) {
106 | this.begin = begin;
107 | }
108 |
109 | public int getEnd() {
110 | return end;
111 | }
112 |
113 | public void setEnd(int end) {
114 | this.end = end;
115 | }
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Monitor.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.dic;
2 |
3 | import java.io.IOException;
4 | import java.security.AccessController;
5 | import java.security.PrivilegedAction;
6 |
7 | import org.apache.http.client.config.RequestConfig;
8 | import org.apache.http.client.methods.CloseableHttpResponse;
9 | import org.apache.http.client.methods.HttpHead;
10 | import org.apache.http.impl.client.CloseableHttpClient;
11 | import org.apache.http.impl.client.HttpClients;
12 | import org.apache.logging.log4j.Logger;
13 | import org.elasticsearch.SpecialPermission;
14 | import org.wltea.analyzer.help.ESPluginLoggerFactory;
15 |
16 | public class Monitor implements Runnable {
17 |
18 | private static final Logger logger = ESPluginLoggerFactory.getLogger(Monitor.class.getName());
19 |
20 | private static CloseableHttpClient httpclient = HttpClients.createDefault();
21 | /*
22 | * 上次更改时间
23 | */
24 | private String last_modified;
25 | /*
26 | * 资源属性
27 | */
28 | private String eTags;
29 |
30 | /*
31 | * 请求地址
32 | */
33 | private String location;
34 |
35 | public Monitor(String location) {
36 | this.location = location;
37 | this.last_modified = null;
38 | this.eTags = null;
39 | }
40 |
41 | public void run() {
42 | SpecialPermission.check();
43 | AccessController.doPrivileged((PrivilegedAction) () -> {
44 | this.runUnprivileged();
45 | return null;
46 | });
47 | }
48 |
49 | /**
50 | * 监控流程:
51 | * ①向词库服务器发送Head请求
52 | * ②从响应中获取Last-Modify、ETags字段值,判断是否变化
53 | * ③如果未变化,休眠1min,返回第①步
54 | * ④如果有变化,重新加载词典
55 | * ⑤休眠1min,返回第①步
56 | */
57 |
58 | public void runUnprivileged() {
59 |
60 | //超时设置
61 | RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
62 | .setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();
63 |
64 | HttpHead head = new HttpHead(location);
65 | head.setConfig(rc);
66 |
67 | //设置请求头
68 | if (last_modified != null) {
69 | head.setHeader("If-Modified-Since", last_modified);
70 | }
71 | if (eTags != null) {
72 | head.setHeader("If-None-Match", eTags);
73 | }
74 |
75 | CloseableHttpResponse response = null;
76 | try {
77 |
78 | response = httpclient.execute(head);
79 |
80 | //返回200 才做操作
81 | if(response.getStatusLine().getStatusCode()==200){
82 |
83 | if (((response.getLastHeader("Last-Modified")!=null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(last_modified))
84 | ||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(eTags))) {
85 |
86 | // 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
87 | Dictionary.getSingleton().reLoadMainDict();
88 | last_modified = response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue();
89 | eTags = response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue();
90 | }
91 | }else if (response.getStatusLine().getStatusCode()==304) {
92 | //没有修改,不做操作
93 | //noop
94 | }else{
95 | logger.info("remote_ext_dict {} return bad code {}" , location , response.getStatusLine().getStatusCode() );
96 | }
97 |
98 | } catch (Exception e) {
99 | logger.error("remote_ext_dict {} error!",e , location);
100 | }finally{
101 | try {
102 | if (response != null) {
103 | response.close();
104 | }
105 | } catch (IOException e) {
106 | logger.error(e.getMessage(), e);
107 | }
108 | }
109 | }
110 |
111 | }
112 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/CharacterUtil.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | * 字符集识别工具类
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | /**
29 | *
30 | * 字符集识别工具类
31 | */
/**
 * Character-classification utility used by the segmenters.
 */
class CharacterUtil {

	public static final int CHAR_USELESS = 0;

	public static final int CHAR_ARABIC = 0X00000001;

	public static final int CHAR_ENGLISH = 0X00000002;

	public static final int CHAR_CHINESE = 0X00000004;

	public static final int CHAR_OTHER_CJK = 0X00000008;


	/**
	 * Identify the character class of a single char.
	 * @param input character to classify
	 * @return one of the CHAR_* constants defined by this class
	 */
	static int identifyCharType(char input){
		// ASCII digits
		if (input >= '0' && input <= '9') {
			return CHAR_ARABIC;
		}
		// ASCII letters
		if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
			return CHAR_ENGLISH;
		}
		Character.UnicodeBlock block = Character.UnicodeBlock.of(input);
		// CJK ideograph blocks handled as Chinese
		if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
				|| block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
				|| block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
			return CHAR_CHINESE;
		}
		// halfwidth/fullwidth forms, Korean (Hangul) and Japanese (kana) scripts
		if (block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
				|| block == Character.UnicodeBlock.HANGUL_SYLLABLES
				|| block == Character.UnicodeBlock.HANGUL_JAMO
				|| block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
				|| block == Character.UnicodeBlock.HIRAGANA
				|| block == Character.UnicodeBlock.KATAKANA
				|| block == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
			return CHAR_OTHER_CJK;
		}
		// anything else is ignored by the tokenizer
		return CHAR_USELESS;
	}

	/**
	 * Normalize a character: fullwidth forms become halfwidth, and optionally
	 * uppercase ASCII letters become lowercase.
	 * @param input character to normalize
	 * @param lowercase whether to fold uppercase ASCII to lowercase
	 * @return the normalized char
	 */
	static char regularize(char input,boolean lowercase){
		if (input == '\u3000') {
			// ideographic (fullwidth) space -> ASCII space
			return ' ';
		}
		if (input > '\uFF00' && input < '\uFF5F') {
			// fullwidth ASCII variants map to ASCII by a fixed offset of 0xFEE0
			return (char) (input - 0xFEE0);
		}
		if (lowercase && input >= 'A' && input <= 'Z') {
			return (char) (input + 32);
		}
		return input;
	}
}
103 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java:
--------------------------------------------------------------------------------
1 |
2 | /**
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | import org.wltea.analyzer.dic.Dictionary;
29 | import org.wltea.analyzer.dic.Hit;
30 |
31 | import java.util.LinkedList;
32 | import java.util.List;
33 |
34 |
35 | /**
36 | * 中文-日韩文子分词器
37 | */
38 | class CJKSegmenter implements ISegmenter {
39 |
40 | //子分词器标签
41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER";
42 | //待处理的分词hit队列
43 | private List tmpHits;
44 |
45 |
46 | CJKSegmenter(){
47 | this.tmpHits = new LinkedList();
48 | }
49 |
50 | /* (non-Javadoc)
51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
52 | */
53 | public void analyze(AnalyzeContext context) {
54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
55 |
56 | //优先处理tmpHits中的hit
57 | if(!this.tmpHits.isEmpty()){
58 | //处理词段队列
59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
60 | for(Hit hit : tmpArray){
61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
62 | if(hit.isMatch()){
63 | //输出当前的词
64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
65 | context.addLexeme(newLexeme);
66 |
67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
68 | this.tmpHits.remove(hit);
69 | }
70 |
71 | }else if(hit.isUnmatch()){
72 | //hit不是词,移除
73 | this.tmpHits.remove(hit);
74 | }
75 | }
76 | }
77 |
78 | //*********************************
79 | //再对当前指针位置的字符进行单字匹配
80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
81 | if(singleCharHit.isMatch()){//首字成词
82 | //输出当前的词
83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
84 | context.addLexeme(newLexeme);
85 |
86 | //同时也是词前缀
87 | if(singleCharHit.isPrefix()){
88 | //前缀匹配则放入hit列表
89 | this.tmpHits.add(singleCharHit);
90 | }
91 | }else if(singleCharHit.isPrefix()){//首字为词前缀
92 | //前缀匹配则放入hit列表
93 | this.tmpHits.add(singleCharHit);
94 | }
95 |
96 |
97 | }else{
98 | //遇到CHAR_USELESS字符
99 | //清空队列
100 | this.tmpHits.clear();
101 | }
102 |
103 | //判断缓冲区是否已经读完
104 | if(context.isBufferConsumed()){
105 | //清空队列
106 | this.tmpHits.clear();
107 | }
108 |
109 | //判断是否锁定缓冲区
110 | if(this.tmpHits.size() == 0){
111 | context.unlockBuffer(SEGMENTER_NAME);
112 |
113 | }else{
114 | context.lockBuffer(SEGMENTER_NAME);
115 | }
116 | }
117 |
118 | /* (non-Javadoc)
119 | * @see org.wltea.analyzer.core.ISegmenter#reset()
120 | */
121 | public void reset() {
122 | //清空队列
123 | this.tmpHits.clear();
124 | }
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/IKSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | */
24 | package org.wltea.analyzer.core;
25 |
26 | import org.wltea.analyzer.cfg.Configuration;
27 |
28 | import java.io.IOException;
29 | import java.io.Reader;
30 | import java.util.ArrayList;
31 | import java.util.List;
32 |
33 | /**
34 | * IK分词器主类
35 | *
36 | */
37 | public final class IKSegmenter {
38 |
39 | //字符窜reader
40 | private Reader input;
41 | //分词器上下文
42 | private AnalyzeContext context;
43 | //分词处理器列表
44 | private List segmenters;
45 | //分词歧义裁决器
46 | private IKArbitrator arbitrator;
47 | private Configuration configuration;
48 |
49 |
50 | /**
51 | * IK分词器构造函数
52 | * @param input
53 | */
54 | public IKSegmenter(Reader input ,Configuration configuration){
55 | this.input = input;
56 | this.configuration = configuration;
57 | this.init();
58 | }
59 |
60 |
61 | /**
62 | * 初始化
63 | */
64 | private void init(){
65 | //初始化分词上下文
66 | this.context = new AnalyzeContext(configuration);
67 | //加载子分词器
68 | this.segmenters = this.loadSegmenters();
69 | //加载歧义裁决器
70 | this.arbitrator = new IKArbitrator();
71 | }
72 |
73 | /**
74 | * 初始化词典,加载子分词器实现
75 | * @return List
76 | */
77 | private List loadSegmenters(){
78 | List segmenters = new ArrayList(4);
79 | //处理字母的子分词器
80 | segmenters.add(new LetterSegmenter());
81 | //处理中文数量词的子分词器
82 | segmenters.add(new CN_QuantifierSegmenter());
83 | //处理中文词的子分词器
84 | segmenters.add(new CJKSegmenter());
85 | return segmenters;
86 | }
87 |
88 | /**
89 | * 分词,获取下一个词元
90 | * @return Lexeme 词元对象
91 | * @throws java.io.IOException
92 | */
93 | public synchronized Lexeme next()throws IOException{
94 | Lexeme l = null;
95 | while((l = context.getNextLexeme()) == null ){
96 | /*
97 | * 从reader中读取数据,填充buffer
98 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理
99 | * 移位处理上次读入的但未处理的数据
100 | */
101 | int available = context.fillBuffer(this.input);
102 | if(available <= 0){
103 | //reader已经读完
104 | context.reset();
105 | return null;
106 |
107 | }else{
108 | //初始化指针
109 | context.initCursor();
110 | do{
111 | //遍历子分词器
112 | for(ISegmenter segmenter : segmenters){
113 | segmenter.analyze(context);
114 | }
115 | //字符缓冲区接近读完,需要读入新的字符
116 | if(context.needRefillBuffer()){
117 | break;
118 | }
119 | //向前移动指针
120 | }while(context.moveCursor());
121 | //重置子分词器,为下轮循环进行初始化
122 | for(ISegmenter segmenter : segmenters){
123 | segmenter.reset();
124 | }
125 | }
126 | //对分词进行歧义处理
127 | this.arbitrator.process(context, configuration.isUseSmart());
128 | //将分词结果输出到结果集,并处理未切分的单个CJK字符
129 | context.outputToResult();
130 | //记录本次分词的缓冲区位移
131 | context.markBufferOffset();
132 | }
133 | return l;
134 | }
135 |
136 | /**
137 | * 重置分词器到初始状态
138 | * @param input
139 | */
140 | public synchronized void reset(Reader input) {
141 | this.input = input;
142 | context.reset();
143 | for(ISegmenter segmenter : segmenters){
144 | segmenter.reset();
145 | }
146 | }
147 | }
148 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 |
25 | *
26 | */
27 | package org.wltea.analyzer.lucene;
28 |
29 | import org.apache.lucene.analysis.Tokenizer;
30 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
32 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
33 | import org.elasticsearch.common.settings.Settings;
34 | import org.elasticsearch.env.Environment;
35 | import org.wltea.analyzer.cfg.Configuration;
36 | import org.wltea.analyzer.core.IKSegmenter;
37 | import org.wltea.analyzer.core.Lexeme;
38 |
39 | import java.io.IOException;
40 | import java.io.Reader;
41 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
42 |
43 | /**
44 | * IK分词器 Lucene Tokenizer适配器类
45 | * 兼容Lucene 4.0版本
46 | */
public final class IKTokenizer extends Tokenizer {

	// the underlying IK segmenter doing the actual tokenization
	private IKSegmenter _IKImplement;

	// term text attribute
	private final CharTermAttribute termAtt;
	// start/end offset attribute
	private final OffsetAttribute offsetAtt;
	// token type attribute (values come from the type constants in org.wltea.analyzer.core.Lexeme)
	private final TypeAttribute typeAtt;
	// end offset of the last emitted token, used as the final offset in end()
	private int endPosition;

	// positions skipped before the current token; reset to 0 here and never
	// incremented in this class, so the position increment is effectively always 1
	private int skippedPositions;

	private PositionIncrementAttribute posIncrAtt;


	/**
	 * Tokenizer adapter constructor (Lucene 4.0 style): registers the token
	 * attributes and wraps the inherited `input` Reader in an IKSegmenter.
	 */
	public IKTokenizer(Configuration configuration){
		super();
		offsetAtt = addAttribute(OffsetAttribute.class);
		termAtt = addAttribute(CharTermAttribute.class);
		typeAtt = addAttribute(TypeAttribute.class);
		posIncrAtt = addAttribute(PositionIncrementAttribute.class);

		_IKImplement = new IKSegmenter(input,configuration);
	}

	/* (non-Javadoc)
	 * Pull the next lexeme from the IK segmenter and copy it into the
	 * registered Lucene attributes.
	 * @see org.apache.lucene.analysis.TokenStream#incrementToken()
	 */
	@Override
	public boolean incrementToken() throws IOException {
		// clear all token attributes from the previous token
		clearAttributes();
		skippedPositions = 0;

		Lexeme nextLexeme = _IKImplement.next();
		if(nextLexeme != null){
			posIncrAtt.setPositionIncrement(skippedPositions +1 );

			// map the Lexeme onto the Lucene attributes
			// token text
			termAtt.append(nextLexeme.getLexemeText());
			// token length
			termAtt.setLength(nextLexeme.getLength());
			// token start/end offsets
			offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

			// remember the end position for end()
			endPosition = nextLexeme.getEndPosition();
			// token type
			typeAtt.setType(nextLexeme.getLexemeTypeString());
			// true: another token was produced
			return true;
		}
		// false: token stream exhausted
		return false;
	}

	/*
	 * (non-Javadoc)
	 * Reset both the Lucene tokenizer state and the IK segmenter over the
	 * (possibly new) `input` Reader.
	 * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		_IKImplement.reset(input);
		skippedPositions = 0;
	}

	@Override
	public final void end() throws IOException {
		super.end();
		// set final offset
		int finalOffset = correctOffset(this.endPosition);
		offsetAtt.setOffset(finalOffset, finalOffset);
		posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
	}
}
131 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/IKArbitrator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Stack;
28 | import java.util.TreeSet;
29 |
30 | /**
31 | * IK分词歧义裁决器
32 | */
33 | class IKArbitrator {
34 |
35 | IKArbitrator(){
36 |
37 | }
38 |
39 | /**
40 | * 分词歧义处理
41 | // * @param orgLexemes
42 | * @param useSmart
43 | */
44 | void process(AnalyzeContext context , boolean useSmart){
45 | QuickSortSet orgLexemes = context.getOrgLexemes();
46 | Lexeme orgLexeme = orgLexemes.pollFirst();
47 |
48 | LexemePath crossPath = new LexemePath();
49 | while(orgLexeme != null){
50 | if(!crossPath.addCrossLexeme(orgLexeme)){
51 | //找到与crossPath不相交的下一个crossPath
52 | if(crossPath.size() == 1 || !useSmart){
53 | //crossPath没有歧义 或者 不做歧义处理
54 | //直接输出当前crossPath
55 | context.addLexemePath(crossPath);
56 | }else{
57 | //对当前的crossPath进行歧义处理
58 | QuickSortSet.Cell headCell = crossPath.getHead();
59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
60 | //输出歧义处理结果judgeResult
61 | context.addLexemePath(judgeResult);
62 | }
63 |
64 | //把orgLexeme加入新的crossPath中
65 | crossPath = new LexemePath();
66 | crossPath.addCrossLexeme(orgLexeme);
67 | }
68 | orgLexeme = orgLexemes.pollFirst();
69 | }
70 |
71 |
72 | //处理最后的path
73 | if(crossPath.size() == 1 || !useSmart){
74 | //crossPath没有歧义 或者 不做歧义处理
75 | //直接输出当前crossPath
76 | context.addLexemePath(crossPath);
77 | }else{
78 | //对当前的crossPath进行歧义处理
79 | QuickSortSet.Cell headCell = crossPath.getHead();
80 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
81 | //输出歧义处理结果judgeResult
82 | context.addLexemePath(judgeResult);
83 | }
84 | }
85 |
86 | /**
87 | * 歧义识别
88 | * @param lexemeCell 歧义路径链表头
89 | * @param fullTextLength 歧义路径文本长度
90 | * @return
91 | */
92 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
93 | //候选路径集合
94 | TreeSet pathOptions = new TreeSet();
95 | //候选结果路径
96 | LexemePath option = new LexemePath();
97 |
98 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
99 | Stack lexemeStack = this.forwardPath(lexemeCell , option);
100 |
101 | //当前词元链并非最理想的,加入候选路径集合
102 | pathOptions.add(option.copy());
103 |
104 | //存在歧义词,处理
105 | QuickSortSet.Cell c = null;
106 | while(!lexemeStack.isEmpty()){
107 | c = lexemeStack.pop();
108 | //回滚词元链
109 | this.backPath(c.getLexeme() , option);
110 | //从歧义词位置开始,递归,生成可选方案
111 | this.forwardPath(c , option);
112 | pathOptions.add(option.copy());
113 | }
114 |
115 | //返回集合中的最优方案
116 | return pathOptions.first();
117 |
118 | }
119 |
120 | /**
121 | * 向前遍历,添加词元,构造一个无歧义词元组合
122 | // * @param LexemePath path
123 | * @return
124 | */
125 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
126 | //发生冲突的Lexeme栈
127 | Stack conflictStack = new Stack();
128 | QuickSortSet.Cell c = lexemeCell;
129 | //迭代遍历Lexeme链表
130 | while(c != null && c.getLexeme() != null){
131 | if(!option.addNotCrossLexeme(c.getLexeme())){
132 | //词元交叉,添加失败则加入lexemeStack栈
133 | conflictStack.push(c);
134 | }
135 | c = c.getNext();
136 | }
137 | return conflictStack;
138 | }
139 |
140 | /**
141 | * 回滚词元链,直到它能够接受指定的词元
142 | // * @param lexeme
143 | * @param l
144 | */
145 | private void backPath(Lexeme l , LexemePath option){
146 | while(option.checkCross(l)){
147 | option.removeTail();
148 | }
149 |
150 | }
151 |
152 | }
153 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/QuickSortSet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK分词器专用的Lexem快速排序集合
29 | */
30 | class QuickSortSet {
31 | //链表头
32 | private Cell head;
33 | //链表尾
34 | private Cell tail;
35 | //链表的实际大小
36 | private int size;
37 |
38 | QuickSortSet(){
39 | this.size = 0;
40 | }
41 |
42 | /**
43 | * 向链表集合添加词元
44 | * @param lexeme
45 | */
46 | boolean addLexeme(Lexeme lexeme){
47 | Cell newCell = new Cell(lexeme);
48 | if(this.size == 0){
49 | this.head = newCell;
50 | this.tail = newCell;
51 | this.size++;
52 | return true;
53 |
54 | }else{
55 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合
56 | return false;
57 |
58 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
59 | this.tail.next = newCell;
60 | newCell.prev = this.tail;
61 | this.tail = newCell;
62 | this.size++;
63 | return true;
64 |
65 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
66 | this.head.prev = newCell;
67 | newCell.next = this.head;
68 | this.head = newCell;
69 | this.size++;
70 | return true;
71 |
72 | }else{
73 | //从尾部上逆
74 | Cell index = this.tail;
75 | while(index != null && index.compareTo(newCell) > 0){
76 | index = index.prev;
77 | }
78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
79 | return false;
80 |
81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置
82 | newCell.prev = index;
83 | newCell.next = index.next;
84 | index.next.prev = newCell;
85 | index.next = newCell;
86 | this.size++;
87 | return true;
88 | }
89 | }
90 | }
91 | return false;
92 | }
93 |
94 | /**
95 | * 返回链表头部元素
96 | * @return
97 | */
98 | Lexeme peekFirst(){
99 | if(this.head != null){
100 | return this.head.lexeme;
101 | }
102 | return null;
103 | }
104 |
105 | /**
106 | * 取出链表集合的第一个元素
107 | * @return Lexeme
108 | */
109 | Lexeme pollFirst(){
110 | if(this.size == 1){
111 | Lexeme first = this.head.lexeme;
112 | this.head = null;
113 | this.tail = null;
114 | this.size--;
115 | return first;
116 | }else if(this.size > 1){
117 | Lexeme first = this.head.lexeme;
118 | this.head = this.head.next;
119 | this.size --;
120 | return first;
121 | }else{
122 | return null;
123 | }
124 | }
125 |
126 | /**
127 | * 返回链表尾部元素
128 | * @return
129 | */
130 | Lexeme peekLast(){
131 | if(this.tail != null){
132 | return this.tail.lexeme;
133 | }
134 | return null;
135 | }
136 |
137 | /**
138 | * 取出链表集合的最后一个元素
139 | * @return Lexeme
140 | */
141 | Lexeme pollLast(){
142 | if(this.size == 1){
143 | Lexeme last = this.head.lexeme;
144 | this.head = null;
145 | this.tail = null;
146 | this.size--;
147 | return last;
148 |
149 | }else if(this.size > 1){
150 | Lexeme last = this.tail.lexeme;
151 | this.tail = this.tail.prev;
152 | this.size--;
153 | return last;
154 |
155 | }else{
156 | return null;
157 | }
158 | }
159 |
160 | /**
161 | * 返回集合大小
162 | * @return
163 | */
164 | int size(){
165 | return this.size;
166 | }
167 |
168 | /**
169 | * 判断集合是否为空
170 | * @return
171 | */
172 | boolean isEmpty(){
173 | return this.size == 0;
174 | }
175 |
176 | /**
177 | * 返回lexeme链的头部
178 | * @return
179 | */
180 | Cell getHead(){
181 | return this.head;
182 | }
183 |
184 | /**
185 | *
186 | * IK 中文分词 版本 5.0
187 | * IK Analyzer release 5.0
188 | *
189 | * Licensed to the Apache Software Foundation (ASF) under one or more
190 | * contributor license agreements. See the NOTICE file distributed with
191 | * this work for additional information regarding copyright ownership.
192 | * The ASF licenses this file to You under the Apache License, Version 2.0
193 | * (the "License"); you may not use this file except in compliance with
194 | * the License. You may obtain a copy of the License at
195 | *
196 | * http://www.apache.org/licenses/LICENSE-2.0
197 | *
198 | * Unless required by applicable law or agreed to in writing, software
199 | * distributed under the License is distributed on an "AS IS" BASIS,
200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | * See the License for the specific language governing permissions and
202 | * limitations under the License.
203 | *
204 | * 源代码由林良益(linliangyi2005@gmail.com)提供
205 | * 版权声明 2012,乌龙茶工作室
206 | * provided by Linliangyi and copyright 2012 by Oolong studio
207 | *
208 | * QuickSortSet集合单元
209 | *
210 | */
211 | class Cell implements Comparable{
212 | private Cell prev;
213 | private Cell next;
214 | private Lexeme lexeme;
215 |
216 | Cell(Lexeme lexeme){
217 | if(lexeme == null){
218 | throw new IllegalArgumentException("lexeme must not be null");
219 | }
220 | this.lexeme = lexeme;
221 | }
222 |
223 | public int compareTo(Cell o) {
224 | return this.lexeme.compareTo(o.lexeme);
225 | }
226 |
227 | public Cell getPrev(){
228 | return this.prev;
229 | }
230 |
231 | public Cell getNext(){
232 | return this.next;
233 | }
234 |
235 | public Lexeme getLexeme(){
236 | return this.lexeme;
237 | }
238 | }
239 | }
240 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.HashSet;
28 | import java.util.LinkedList;
29 | import java.util.List;
30 | import java.util.Set;
31 |
32 | import org.wltea.analyzer.dic.Dictionary;
33 | import org.wltea.analyzer.dic.Hit;
34 |
35 | /**
36 | *
37 | * 中文数量词子分词器
38 | */
39 | class CN_QuantifierSegmenter implements ISegmenter{
40 |
41 | //子分词器标签
42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
43 |
44 | //中文数词
45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum
46 | private static Set ChnNumberChars = new HashSet();
47 | static{
48 | char[] ca = Chn_Num.toCharArray();
49 | for(char nChar : ca){
50 | ChnNumberChars.add(nChar);
51 | }
52 | }
53 |
54 | /*
55 | * 词元的开始位置,
56 | * 同时作为子分词器状态标识
57 | * 当start > -1 时,标识当前的分词器正在处理字符
58 | */
59 | private int nStart;
60 | /*
61 | * 记录词元结束位置
62 | * end记录的是在词元中最后一个出现的合理的数词结束
63 | */
64 | private int nEnd;
65 |
66 | //待处理的量词hit队列
67 | private List countHits;
68 |
69 |
70 | CN_QuantifierSegmenter(){
71 | nStart = -1;
72 | nEnd = -1;
73 | this.countHits = new LinkedList();
74 | }
75 |
76 | /**
77 | * 分词
78 | */
79 | public void analyze(AnalyzeContext context) {
80 | //处理中文数词
81 | this.processCNumber(context);
82 | //处理中文量词
83 | this.processCount(context);
84 |
85 | //判断是否锁定缓冲区
86 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
87 | //对缓冲区解锁
88 | context.unlockBuffer(SEGMENTER_NAME);
89 | }else{
90 | context.lockBuffer(SEGMENTER_NAME);
91 | }
92 | }
93 |
94 |
95 | /**
96 | * 重置子分词器状态
97 | */
98 | public void reset() {
99 | nStart = -1;
100 | nEnd = -1;
101 | countHits.clear();
102 | }
103 |
104 | /**
105 | * 处理数词
106 | */
107 | private void processCNumber(AnalyzeContext context){
108 | if(nStart == -1 && nEnd == -1){//初始状态
109 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
110 | && ChnNumberChars.contains(context.getCurrentChar())){
111 | //记录数词的起始、结束位置
112 | nStart = context.getCursor();
113 | nEnd = context.getCursor();
114 | }
115 | }else{//正在处理状态
116 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
117 | && ChnNumberChars.contains(context.getCurrentChar())){
118 | //记录数词的结束位置
119 | nEnd = context.getCursor();
120 | }else{
121 | //输出数词
122 | this.outputNumLexeme(context);
123 | //重置头尾指针
124 | nStart = -1;
125 | nEnd = -1;
126 | }
127 | }
128 |
129 | //缓冲区已经用完,还有尚未输出的数词
130 | if(context.isBufferConsumed() && (nStart != -1 && nEnd != -1)){
131 | //输出数词
132 | outputNumLexeme(context);
133 | //重置头尾指针
134 | nStart = -1;
135 | nEnd = -1;
136 | }
137 | }
138 |
139 | /**
140 | * 处理中文量词
141 | * @param context
142 | */
143 | private void processCount(AnalyzeContext context){
144 | // 判断是否需要启动量词扫描
145 | if(!this.needCountScan(context)){
146 | return;
147 | }
148 |
149 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
150 |
151 | //优先处理countHits中的hit
152 | if(!this.countHits.isEmpty()){
153 | //处理词段队列
154 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
155 | for(Hit hit : tmpArray){
156 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
157 | if(hit.isMatch()){
158 | //输出当前的词
159 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
160 | context.addLexeme(newLexeme);
161 |
162 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
163 | this.countHits.remove(hit);
164 | }
165 |
166 | }else if(hit.isUnmatch()){
167 | //hit不是词,移除
168 | this.countHits.remove(hit);
169 | }
170 | }
171 | }
172 |
173 | //*********************************
174 | //对当前指针位置的字符进行单字匹配
175 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
176 | if(singleCharHit.isMatch()){//首字成量词词
177 | //输出当前的词
178 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
179 | context.addLexeme(newLexeme);
180 |
181 | //同时也是词前缀
182 | if(singleCharHit.isPrefix()){
183 | //前缀匹配则放入hit列表
184 | this.countHits.add(singleCharHit);
185 | }
186 | }else if(singleCharHit.isPrefix()){//首字为量词前缀
187 | //前缀匹配则放入hit列表
188 | this.countHits.add(singleCharHit);
189 | }
190 |
191 |
192 | }else{
193 | //输入的不是中文字符
194 | //清空未成形的量词
195 | this.countHits.clear();
196 | }
197 |
198 | //缓冲区数据已经读完,还有尚未输出的量词
199 | if(context.isBufferConsumed()){
200 | //清空未成形的量词
201 | this.countHits.clear();
202 | }
203 | }
204 |
205 | /**
206 | * 判断是否需要扫描量词
207 | * @return
208 | */
209 | private boolean needCountScan(AnalyzeContext context){
210 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
211 | //正在处理中文数词,或者正在处理量词
212 | return true;
213 | }else{
214 | //找到一个相邻的数词
215 | if(!context.getOrgLexemes().isEmpty()){
216 | Lexeme l = context.getOrgLexemes().peekLast();
217 | if((Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType())
218 | && (l.getBegin() + l.getLength() == context.getCursor())){
219 | return true;
220 | }
221 | }
222 | }
223 | return false;
224 | }
225 |
226 | /**
227 | * 添加数词词元到结果集
228 | * @param context
229 | */
230 | private void outputNumLexeme(AnalyzeContext context){
231 | if(nStart > -1 && nEnd > -1){
232 | //输出数词
233 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
234 | context.addLexeme(newLexeme);
235 |
236 | }
237 | }
238 |
239 | }
240 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 |
28 | /**
29 | * Lexeme链(路径)
30 | */
31 | class LexemePath extends QuickSortSet implements Comparable{
32 |
33 | //起始位置
34 | private int pathBegin;
35 | //结束
36 | private int pathEnd;
37 | //词元链的有效字符长度
38 | private int payloadLength;
39 |
40 | LexemePath(){
41 | this.pathBegin = -1;
42 | this.pathEnd = -1;
43 | this.payloadLength = 0;
44 | }
45 |
46 | /**
47 | * 向LexemePath追加相交的Lexeme
48 | * @param lexeme
49 | * @return
50 | */
51 | boolean addCrossLexeme(Lexeme lexeme){
52 | if(this.isEmpty()){
53 | this.addLexeme(lexeme);
54 | this.pathBegin = lexeme.getBegin();
55 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
56 | this.payloadLength += lexeme.getLength();
57 | return true;
58 |
59 | }else if(this.checkCross(lexeme)){
60 | this.addLexeme(lexeme);
61 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){
62 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
63 | }
64 | this.payloadLength = this.pathEnd - this.pathBegin;
65 | return true;
66 |
67 | }else{
68 | return false;
69 |
70 | }
71 | }
72 |
73 | /**
74 | * 向LexemePath追加不相交的Lexeme
75 | * @param lexeme
76 | * @return
77 | */
78 | boolean addNotCrossLexeme(Lexeme lexeme){
79 | if(this.isEmpty()){
80 | this.addLexeme(lexeme);
81 | this.pathBegin = lexeme.getBegin();
82 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
83 | this.payloadLength += lexeme.getLength();
84 | return true;
85 |
86 | }else if(this.checkCross(lexeme)){
87 | return false;
88 |
89 | }else{
90 | this.addLexeme(lexeme);
91 | this.payloadLength += lexeme.getLength();
92 | Lexeme head = this.peekFirst();
93 | this.pathBegin = head.getBegin();
94 | Lexeme tail = this.peekLast();
95 | this.pathEnd = tail.getBegin() + tail.getLength();
96 | return true;
97 |
98 | }
99 | }
100 |
101 | /**
102 | * 移除尾部的Lexeme
103 | * @return
104 | */
105 | Lexeme removeTail(){
106 | Lexeme tail = this.pollLast();
107 | if(this.isEmpty()){
108 | this.pathBegin = -1;
109 | this.pathEnd = -1;
110 | this.payloadLength = 0;
111 | }else{
112 | this.payloadLength -= tail.getLength();
113 | Lexeme newTail = this.peekLast();
114 | this.pathEnd = newTail.getBegin() + newTail.getLength();
115 | }
116 | return tail;
117 | }
118 |
119 | /**
120 | * 检测词元位置交叉(有歧义的切分)
121 | * @param lexeme
122 | * @return
123 | */
124 | boolean checkCross(Lexeme lexeme){
125 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
126 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength());
127 | }
128 |
129 | int getPathBegin() {
130 | return pathBegin;
131 | }
132 |
133 | int getPathEnd() {
134 | return pathEnd;
135 | }
136 |
137 | /**
138 | * 获取Path的有效词长
139 | * @return
140 | */
141 | int getPayloadLength(){
142 | return this.payloadLength;
143 | }
144 |
145 | /**
146 | * 获取LexemePath的路径长度
147 | * @return
148 | */
149 | int getPathLength(){
150 | return this.pathEnd - this.pathBegin;
151 | }
152 |
153 |
154 | /**
155 | * X权重(词元长度积)
156 | * @return
157 | */
158 | int getXWeight(){
159 | int product = 1;
160 | Cell c = this.getHead();
161 | while( c != null && c.getLexeme() != null){
162 | product *= c.getLexeme().getLength();
163 | c = c.getNext();
164 | }
165 | return product;
166 | }
167 |
168 | /**
169 | * 词元位置权重
170 | * @return
171 | */
172 | int getPWeight(){
173 | int pWeight = 0;
174 | int p = 0;
175 | Cell c = this.getHead();
176 | while( c != null && c.getLexeme() != null){
177 | p++;
178 | pWeight += p * c.getLexeme().getLength() ;
179 | c = c.getNext();
180 | }
181 | return pWeight;
182 | }
183 |
184 | LexemePath copy(){
185 | LexemePath theCopy = new LexemePath();
186 | theCopy.pathBegin = this.pathBegin;
187 | theCopy.pathEnd = this.pathEnd;
188 | theCopy.payloadLength = this.payloadLength;
189 | Cell c = this.getHead();
190 | while( c != null && c.getLexeme() != null){
191 | theCopy.addLexeme(c.getLexeme());
192 | c = c.getNext();
193 | }
194 | return theCopy;
195 | }
196 |
197 | public int compareTo(LexemePath o) {
198 | //比较有效文本长度
199 | if(this.payloadLength > o.payloadLength){
200 | return -1;
201 | }else if(this.payloadLength < o.payloadLength){
202 | return 1;
203 | }else{
204 | //比较词元个数,越少越好
205 | if(this.size() < o.size()){
206 | return -1;
207 | }else if (this.size() > o.size()){
208 | return 1;
209 | }else{
210 | //路径跨度越大越好
211 | if(this.getPathLength() > o.getPathLength()){
212 | return -1;
213 | }else if(this.getPathLength() < o.getPathLength()){
214 | return 1;
215 | }else {
216 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
217 | if(this.pathEnd > o.pathEnd){
218 | return -1;
219 | }else if(pathEnd < o.pathEnd){
220 | return 1;
221 | }else{
222 | //词长越平均越好
223 | if(this.getXWeight() > o.getXWeight()){
224 | return -1;
225 | }else if(this.getXWeight() < o.getXWeight()){
226 | return 1;
227 | }else {
228 | //词元位置权重比较
229 | if(this.getPWeight() > o.getPWeight()){
230 | return -1;
231 | }else if(this.getPWeight() < o.getPWeight()){
232 | return 1;
233 | }
234 |
235 | }
236 | }
237 | }
238 | }
239 | }
240 | return 0;
241 | }
242 |
243 | public String toString(){
244 | StringBuffer sb = new StringBuffer();
245 | sb.append("pathBegin : ").append(pathBegin).append("\r\n");
246 | sb.append("pathEnd : ").append(pathEnd).append("\r\n");
247 | sb.append("payloadLength : ").append(payloadLength).append("\r\n");
248 | Cell head = this.getHead();
249 | while(head != null){
250 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
251 | head = head.getNext();
252 | }
253 | return sb.toString();
254 | }
255 |
256 | }
257 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/Lexeme.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK词元对象
29 | */
30 | public class Lexeme implements Comparable{
31 | //lexemeType常量
32 | //未知
33 | public static final int TYPE_UNKNOWN = 0;
34 | //英文
35 | public static final int TYPE_ENGLISH = 1;
36 | //数字
37 | public static final int TYPE_ARABIC = 2;
38 | //英文数字混合
39 | public static final int TYPE_LETTER = 3;
40 | //中文词元
41 | public static final int TYPE_CNWORD = 4;
42 | //中文单字
43 | public static final int TYPE_CNCHAR = 64;
44 | //日韩文字
45 | public static final int TYPE_OTHER_CJK = 8;
46 | //中文数词
47 | public static final int TYPE_CNUM = 16;
48 | //中文量词
49 | public static final int TYPE_COUNT = 32;
50 | //中文数量词
51 | public static final int TYPE_CQUAN = 48;
52 |
53 | //词元的起始位移
54 | private int offset;
55 | //词元的相对起始位置
56 | private int begin;
57 | //词元的长度
58 | private int length;
59 | //词元文本
60 | private String lexemeText;
61 | //词元类型
62 | private int lexemeType;
63 |
64 |
65 | public Lexeme(int offset , int begin , int length , int lexemeType){
66 | this.offset = offset;
67 | this.begin = begin;
68 | if(length < 0){
69 | throw new IllegalArgumentException("length < 0");
70 | }
71 | this.length = length;
72 | this.lexemeType = lexemeType;
73 | }
74 |
75 | /*
76 | * 判断词元相等算法
77 | * 起始位置偏移、起始位置、终止位置相同
78 | * @see java.lang.Object#equals(Object o)
79 | */
80 | public boolean equals(Object o){
81 | if(o == null){
82 | return false;
83 | }
84 |
85 | if(this == o){
86 | return true;
87 | }
88 |
89 | if(o instanceof Lexeme){
90 | Lexeme other = (Lexeme)o;
91 | if(this.offset == other.getOffset()
92 | && this.begin == other.getBegin()
93 | && this.length == other.getLength()){
94 | return true;
95 | }else{
96 | return false;
97 | }
98 | }else{
99 | return false;
100 | }
101 | }
102 |
103 | /*
104 | * 词元哈希编码算法
105 | * @see java.lang.Object#hashCode()
106 | */
107 | public int hashCode(){
108 | int absBegin = getBeginPosition();
109 | int absEnd = getEndPosition();
110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
111 | }
112 |
113 | /*
114 | * 词元在排序集合中的比较算法
115 | * @see java.lang.Comparable#compareTo(java.lang.Object)
116 | */
117 | public int compareTo(Lexeme other) {
118 | //起始位置优先
119 | if(this.begin < other.getBegin()){
120 | return -1;
121 | }else if(this.begin == other.getBegin()){
122 | //词元长度优先
123 | if(this.length > other.getLength()){
124 | return -1;
125 | }else if(this.length == other.getLength()){
126 | return 0;
127 | }else {//this.length < other.getLength()
128 | return 1;
129 | }
130 |
131 | }else{//this.begin > other.getBegin()
132 | return 1;
133 | }
134 | }
135 |
136 | public int getOffset() {
137 | return offset;
138 | }
139 |
140 | public void setOffset(int offset) {
141 | this.offset = offset;
142 | }
143 |
144 | public int getBegin() {
145 | return begin;
146 | }
147 | /**
148 | * 获取词元在文本中的起始位置
149 | * @return int
150 | */
151 | public int getBeginPosition(){
152 | return offset + begin;
153 | }
154 |
155 | public void setBegin(int begin) {
156 | this.begin = begin;
157 | }
158 |
159 | /**
160 | * 获取词元在文本中的结束位置
161 | * @return int
162 | */
163 | public int getEndPosition(){
164 | return offset + begin + length;
165 | }
166 |
167 | /**
168 | * 获取词元的字符长度
169 | * @return int
170 | */
171 | public int getLength(){
172 | return this.length;
173 | }
174 |
175 | public void setLength(int length) {
176 | if(this.length < 0){
177 | throw new IllegalArgumentException("length < 0");
178 | }
179 | this.length = length;
180 | }
181 |
182 | /**
183 | * 获取词元的文本内容
184 | * @return String
185 | */
186 | public String getLexemeText() {
187 | if(lexemeText == null){
188 | return "";
189 | }
190 | return lexemeText;
191 | }
192 |
193 | public void setLexemeText(String lexemeText) {
194 | if(lexemeText == null){
195 | this.lexemeText = "";
196 | this.length = 0;
197 | }else{
198 | this.lexemeText = lexemeText;
199 | this.length = lexemeText.length();
200 | }
201 | }
202 |
203 | /**
204 | * 获取词元类型
205 | * @return int
206 | */
207 | public int getLexemeType() {
208 | return lexemeType;
209 | }
210 |
211 | /**
212 | * 获取词元类型标示字符串
213 | * @return String
214 | */
215 | public String getLexemeTypeString(){
216 | switch(lexemeType) {
217 |
218 | case TYPE_ENGLISH :
219 | return "ENGLISH";
220 |
221 | case TYPE_ARABIC :
222 | return "ARABIC";
223 |
224 | case TYPE_LETTER :
225 | return "LETTER";
226 |
227 | case TYPE_CNWORD :
228 | return "CN_WORD";
229 |
230 | case TYPE_CNCHAR :
231 | return "CN_CHAR";
232 |
233 | case TYPE_OTHER_CJK :
234 | return "OTHER_CJK";
235 |
236 | case TYPE_COUNT :
237 | return "COUNT";
238 |
239 | case TYPE_CNUM :
240 | return "TYPE_CNUM";
241 |
242 | case TYPE_CQUAN:
243 | return "TYPE_CQUAN";
244 |
245 | default :
246 | return "UNKONW";
247 | }
248 | }
249 |
250 |
251 | public void setLexemeType(int lexemeType) {
252 | this.lexemeType = lexemeType;
253 | }
254 |
255 | /**
256 | * 合并两个相邻的词元
257 | * @param l
258 | * @param lexemeType
259 | * @return boolean 词元是否成功合并
260 | */
261 | public boolean append(Lexeme l , int lexemeType){
262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){
263 | this.length += l.getLength();
264 | this.lexemeType = lexemeType;
265 | return true;
266 | }else {
267 | return false;
268 | }
269 | }
270 |
271 |
272 | /**
273 | *
274 | */
275 | public String toString(){
276 | StringBuffer strbuf = new StringBuffer();
277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
279 | strbuf.append(this.getLexemeTypeString());
280 | return strbuf.toString();
281 | }
282 |
283 |
284 | }
285 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | IK Analysis for Elasticsearch
2 | =============================
3 |
The IK Analysis plugin integrates the Lucene IK analyzer (http://code.google.com/p/ik-analyzer/) into Elasticsearch and supports customized dictionaries.
5 |
6 | Analyzer: `ik_smart` , `ik_max_word` , Tokenizer: `ik_smart` , `ik_max_word`
7 |
8 | Versions
9 | --------
10 |
11 | IK version | ES version
12 | -----------|-----------
13 | master | 7.x -> master
14 | 6.x| 6.x
15 | 5.x| 5.x
16 | 1.10.6 | 2.4.6
17 | 1.9.5 | 2.3.5
18 | 1.8.1 | 2.2.1
19 | 1.7.0 | 2.1.1
20 | 1.5.0 | 2.0.0
21 | 1.2.6 | 1.0.0
22 | 1.2.5 | 0.90.x
23 | 1.1.3 | 0.20.x
24 | 1.0.0 | 0.16.2 -> 0.19.0
25 |
26 | Install
27 | -------
28 |
29 | 1.download or compile
30 |
 * optional 1 - download the pre-built package from here: https://github.com/medcl/elasticsearch-analysis-ik/releases
32 |
33 | create plugin folder `cd your-es-root/plugins/ && mkdir ik`
34 |
35 | unzip plugin to folder `your-es-root/plugins/ik`
36 |
37 | * optional 2 - use elasticsearch-plugin to install ( supported from version v5.5.1 ):
38 |
39 | ```
40 | ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.3.0/elasticsearch-analysis-ik-6.3.0.zip
41 | ```
42 |
NOTE: replace `6.3.0` with your own elasticsearch version
44 |
45 | 2.restart elasticsearch
46 |
47 |
48 |
49 | #### Quick Example
50 |
1.create an index
52 |
53 | ```bash
54 | curl -XPUT http://localhost:9200/index
55 | ```
56 |
57 | 2.create a mapping
58 |
59 | ```bash
60 | curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
61 | {
62 | "properties": {
63 | "content": {
64 | "type": "text",
65 | "analyzer": "ik_max_word",
66 | "search_analyzer": "ik_smart"
67 | }
68 | }
69 |
70 | }'
71 | ```
72 |
73 | 3.index some docs
74 |
75 | ```bash
76 | curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
77 | {"content":"美国留给伊拉克的是个烂摊子吗"}
78 | '
79 | ```
80 |
81 | ```bash
82 | curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
83 | {"content":"公安部:各地校车将享最高路权"}
84 | '
85 | ```
86 |
87 | ```bash
88 | curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
89 | {"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
90 | '
91 | ```
92 |
93 | ```bash
94 | curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
95 | {"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
96 | '
97 | ```
98 |
99 | 4.query with highlighting
100 |
101 | ```bash
102 | curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
103 | {
104 | "query" : { "match" : { "content" : "中国" }},
105 | "highlight" : {
106 | "pre_tags" : ["", ""],
107 | "post_tags" : ["", ""],
108 | "fields" : {
109 | "content" : {}
110 | }
111 | }
112 | }
113 | '
114 | ```
115 |
116 | Result
117 |
118 | ```json
119 | {
120 | "took": 14,
121 | "timed_out": false,
122 | "_shards": {
123 | "total": 5,
124 | "successful": 5,
125 | "failed": 0
126 | },
127 | "hits": {
128 | "total": 2,
129 | "max_score": 2,
130 | "hits": [
131 | {
132 | "_index": "index",
133 | "_type": "fulltext",
134 | "_id": "4",
135 | "_score": 2,
136 | "_source": {
137 | "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
138 | },
139 | "highlight": {
140 | "content": [
141 | "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首 "
142 | ]
143 | }
144 | },
145 | {
146 | "_index": "index",
147 | "_type": "fulltext",
148 | "_id": "3",
149 | "_score": 2,
150 | "_source": {
151 | "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
152 | },
153 | "highlight": {
154 | "content": [
155 | "均每天扣1艘中国渔船 "
156 | ]
157 | }
158 | }
159 | ]
160 | }
161 | }
162 | ```
163 |
164 | ### Dictionary Configuration
165 |
166 | `IKAnalyzer.cfg.xml` can be located at `{conf}/analysis-ik/config/IKAnalyzer.cfg.xml`
167 | or `{plugins}/elasticsearch-analysis-ik-*/config/IKAnalyzer.cfg.xml`
168 |
169 | ```xml
170 |
171 |
172 |
173 | IK Analyzer 扩展配置
174 |
175 | custom/mydict.dic;custom/single_word_low_freq.dic
176 |
177 | custom/ext_stopword.dic
178 |
179 | location
180 |
181 | http://xxx.com/xxx.dic
182 |
183 | ```
184 |
185 | ### 热更新 IK 分词使用方法
186 |
187 | 目前该插件支持热更新 IK 分词,通过上文在 IK 配置文件中提到的如下配置
188 |
189 | ```xml
190 |
191 | location
192 |
193 | location
194 | ```
195 |
196 | 其中 `location` 是指一个 url,比如 `http://yoursite.com/getCustomDict`,该请求只需满足以下两点即可完成分词热更新。
197 |
198 | 1. 该 http 请求需要返回两个头部(header),一个是 `Last-Modified`,一个是 `ETag`,这两者都是字符串类型,只要有一个发生变化,该插件就会去抓取新的分词进而更新词库。
199 |
200 | 2. 该 http 请求返回的内容格式是一行一个分词,换行符用 `\n` 即可。
201 |
202 | 满足上面两点要求就可以实现热更新分词了,不需要重启 ES 实例。
203 |
204 | 可以将需自动更新的热词放在一个 UTF-8 编码的 .txt 文件里,放在 nginx 或其他简易 http server 下,当 .txt 文件修改时,http server 会在客户端请求该文件时自动返回相应的 Last-Modified 和 ETag。可以另外做一个工具来从业务系统提取相关词汇,并更新这个 .txt 文件。
205 |
206 | have fun.
207 |
208 | 常见问题
209 | -------
210 |
211 | 1.自定义词典为什么没有生效?
212 |
213 | 请确保你的扩展词典的文本格式为 UTF8 编码
214 |
215 | 2.如何手动安装?
216 |
217 |
218 | ```bash
219 | git clone https://github.com/medcl/elasticsearch-analysis-ik
220 | cd elasticsearch-analysis-ik
221 | git checkout tags/{version}
222 | mvn clean
223 | mvn compile
224 | mvn package
225 | ```
226 |
227 | 拷贝和解压release下的文件: #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip 到你的 elasticsearch 插件目录, 如: plugins/ik
228 | 重启elasticsearch
229 |
230 | 3.分词测试失败
231 | 请在某个索引下调用analyze接口测试,而不是直接调用analyze接口
232 | 如:
233 | ```bash
234 | curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d'
235 | {
236 | "text":"中华人民共和国MN","tokenizer": "my_ik"
237 | }'
238 | ```
239 |
240 |
241 | 4. ik_max_word 和 ik_smart 什么区别?
242 |
243 |
244 | ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合,适合 Term Query;
245 |
246 | ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”,适合 Phrase 查询。
247 |
248 | Changes
249 | ------
250 | *自 v5.0.0 起*
251 |
252 | - 移除名为 `ik` 的analyzer和tokenizer,请分别使用 `ik_smart` 和 `ik_max_word`
253 |
254 |
255 | Thanks
256 | ------
257 | YourKit supports IK Analysis for ElasticSearch project with its full-featured Java Profiler.
258 | YourKit, LLC is the creator of innovative and intelligent tools for profiling
259 | Java and .NET applications. Take a look at YourKit's leading software products:
260 | YourKit Java Profiler and
261 | YourKit .NET Profiler.
262 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/DictSegment.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.util.Arrays;
29 | import java.util.Map;
30 | import java.util.concurrent.ConcurrentHashMap;
31 |
32 | /**
33 | * 词典树分段,表示词典树的一个分枝
34 | */
35 | class DictSegment implements Comparable{
36 |
	//shared character table interning the characters used by every node
	private static final Map charMap = new ConcurrentHashMap(16 , 0.95f);
	//maximum number of children kept in array storage before switching to a map
	private static final int ARRAY_LENGTH_LIMIT = 3;


	//map storage for children (used once the node outgrows the array)
	private Map childrenMap;
	//array storage for children (used while the node is small)
	private DictSegment[] childrenArray;


	//the character stored on this node
	private Character nodeChar;
	//number of child segments on this node
	//storeSize <= ARRAY_LENGTH_LIMIT: array storage; storeSize > ARRAY_LENGTH_LIMIT: map storage
	private int storeSize = 0;
	//node state, default 0; 1 means the path from the root to this node forms a complete word
	private int nodeState = 0;
56 |
57 |
	/**
	 * Creates a trie node for the given character.
	 * @param nodeChar the character stored on this node; must not be null
	 * @throws IllegalArgumentException if nodeChar is null
	 */
	DictSegment(Character nodeChar){
		if(nodeChar == null){
			throw new IllegalArgumentException("node char cannot be empty");
		}
		this.nodeChar = nodeChar;
	}
64 |
	//returns the character stored on this node
	Character getNodeChar() {
		return nodeChar;
	}
68 |
	/*
	 * Returns true if this node has at least one child segment.
	 */
	boolean hasNextNode(){
		return this.storeSize > 0;
	}
75 |
	/**
	 * Match a whole word against the trie starting at this node.
	 * @param charArray the complete word to match
	 * @return Hit
	 */
	Hit match(char[] charArray){
		return this.match(charArray , 0 , charArray.length , null);
	}
84 |
	/**
	 * Match a word fragment against the trie starting at this node.
	 * @param charArray text being analyzed
	 * @param begin start index of the fragment
	 * @param length fragment length
	 * @return Hit
	 */
	Hit match(char[] charArray , int begin , int length){
		return this.match(charArray , begin , length , null);
	}
95 |
	/**
	 * Match a word fragment against this trie node, descending one
	 * character per recursive call.
	 * @param charArray text being analyzed
	 * @param begin index of the character matched at this node
	 * @param length number of characters remaining in the fragment
	 * @param searchHit hit to reuse and update; a new one is created when null
	 * @return Hit flagged as match (a complete word ends here), prefix
	 *         (longer words continue from here), both, or unmatch
	 */
	Hit match(char[] charArray , int begin , int length , Hit searchHit){

		if(searchHit == null){
			//no hit supplied: create one and record the match start
			searchHit= new Hit();
			//set the hit's starting text position
			searchHit.setBegin(begin);
		}else{
			//reuse the supplied hit, resetting its state to unmatch
			searchHit.setUnmatch();
		}
		//advance the hit's current end position
		searchHit.setEnd(begin);

		Character keyChar = Character.valueOf(charArray[begin]);
		DictSegment ds = null;

		//copy the children references to locals so a concurrent update
		//(fillSegment may swap the array for a map) is not seen half-way
		DictSegment[] segmentArray = this.childrenArray;
		Map segmentMap = this.childrenMap;

		//STEP1 look up the child DictSegment for keyChar on this node
		if(segmentArray != null){
			//array storage: binary search within the used portion
			DictSegment keySegment = new DictSegment(keyChar);
			int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment);
			if(position >= 0){
				ds = segmentArray[position];
			}

		}else if(segmentMap != null){
			//map storage
			ds = (DictSegment)segmentMap.get(keyChar);
		}

		//STEP2 child found: recurse deeper, or report the final state
		if(ds != null){
			if(length > 1){
				//word not fully consumed: continue matching on the child
				return ds.match(charArray, begin + 1 , length - 1 , searchHit);
			}else if (length == 1){

				//last character of the fragment
				if(ds.nodeState == 1){
					//a complete word ends exactly here
					searchHit.setMatch();
				}
				if(ds.hasNextNode()){
					//the fragment is also a prefix of longer words
					searchHit.setPrefix();
					//remember the node so matching can resume from it later
					searchHit.setMatchedDictSegment(ds);
				}
				return searchHit;
			}

		}
		//STEP3 no child for keyChar: the hit stays unmatched
		return searchHit;
	}
164 |
	/**
	 * Load a word into this dictionary segment (enabled state).
	 * @param charArray the word to add
	 */
	void fillSegment(char[] charArray){
		this.fillSegment(charArray, 0 , charArray.length , 1);
	}
172 |
	/**
	 * Mask (disable) one word in the dictionary without removing its nodes.
	 * @param charArray the word to disable
	 */
	void disableSegment(char[] charArray){
		this.fillSegment(charArray, 0 , charArray.length , 0);
	}
180 |
	/**
	 * Insert a word fragment into the trie, one character per recursion level.
	 * Synchronized so concurrent dictionary loads/updates do not corrupt nodes.
	 * @param charArray word characters
	 * @param begin index of the character handled at this level
	 * @param length number of characters left to insert
	 * @param enabled 1 marks the final node as a complete word, 0 masks the word
	 */
	private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){
		//intern the character through the shared character table
		Character beginChar = Character.valueOf(charArray[begin]);
		Character keyChar = charMap.get(beginChar);
		//character not interned yet: add it to the shared table
		if(keyChar == null){
			charMap.put(beginChar, beginChar);
			keyChar = beginChar;
		}

		//find the child for keyChar, creating it when enabled==1
		//(the enabled flag doubles as the create flag)
		DictSegment ds = lookforSegment(keyChar , enabled);
		if(ds != null){
			//descend into the child segment
			if(length > 1){
				//more characters remain: recurse
				ds.fillSegment(charArray, begin + 1, length - 1 , enabled);
			}else if (length == 1){
				//last character of the word: set the node state;
				//enabled=1 marks a complete word, enabled=0 masks the word
				ds.nodeState = enabled;
			}
		}

	}
213 |
	/**
	 * Finds the child segment for keyChar under this node.
	 * @param keyChar
	 * @param create =1 create a new segment when none is found; =0 return null instead
	 * @return the child segment, or null when absent and create == 0
	 */
	private DictSegment lookforSegment(Character keyChar , int create){

		DictSegment ds = null;

		if(this.storeSize <= ARRAY_LENGTH_LIMIT){
			//array storage; the array is created lazily
			DictSegment[] segmentArray = getChildrenArray();
			//binary search the used portion of the array
			DictSegment keySegment = new DictSegment(keyChar);
			int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment);
			if(position >= 0){
				ds = segmentArray[position];
			}

			//not found in the array
			if(ds == null && create == 1){
				ds = keySegment;
				if(this.storeSize < ARRAY_LENGTH_LIMIT){
					//array still has room: stay with array storage
					segmentArray[this.storeSize] = ds;
					//segment count +1
					this.storeSize++;
					Arrays.sort(segmentArray , 0 , this.storeSize);

				}else{
					//array is full: switch to map storage
					//the map is created lazily
					Map segmentMap = getChildrenMap();
					//move the existing array entries into the map
					migrate(segmentArray , segmentMap);
					//store the new segment
					segmentMap.put(keyChar, ds);
					//segment count +1; must happen before the array is released
					//so a concurrent reader never observes an empty container
					this.storeSize++;
					//release the array reference
					this.childrenArray = null;
				}

			}

		}else{
			//map storage; the map is created lazily
			Map segmentMap = getChildrenMap();
			//look up in the map
			ds = (DictSegment)segmentMap.get(keyChar);
			if(ds == null && create == 1){
				//construct a new segment
				ds = new DictSegment(keyChar);
				segmentMap.put(keyChar , ds);
				//segment count +1
				this.storeSize ++;
			}
		}

		return ds;
	}
276 |
277 |
	/**
	 * Returns the array container for child segments, lazily allocating it
	 * under the instance lock on first use.
	 * The read after the synchronized block is performed by the same thread
	 * that (if necessary) just initialized the field, so it is never null.
	 */
	private DictSegment[] getChildrenArray(){
		synchronized(this){
			if(this.childrenArray == null){
				this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
			}
		}
		return this.childrenArray;
	}
290 |
	/**
	 * Returns the Map container for child segments, lazily allocating it
	 * under the instance lock on first use.
	 * Initial capacity is twice the array limit so entries migrated from
	 * the array container fit without an immediate rehash.
	 */
	private Map getChildrenMap(){
		synchronized(this){
			if(this.childrenMap == null){
				this.childrenMap = new ConcurrentHashMap(ARRAY_LENGTH_LIMIT * 2,0.8f);
			}
		}
		return this.childrenMap;
	}
303 |
304 | /**
305 | * 将数组中的segment迁移到Map中
306 | * @param segmentArray
307 | */
308 | private void migrate(DictSegment[] segmentArray , Map segmentMap){
309 | for(DictSegment segment : segmentArray){
310 | if(segment != null){
311 | segmentMap.put(segment.nodeChar, segment);
312 | }
313 | }
314 | }
315 |
316 | /**
317 | * 实现Comparable接口
318 | * @param o
319 | * @return int
320 | */
321 | public int compareTo(DictSegment o) {
322 | //对当前节点存储的char进行比较
323 | return this.nodeChar.compareTo(o.nodeChar);
324 | }
325 |
326 | }
327 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Arrays;
28 |
29 | /**
30 | *
31 | * 英文字符及阿拉伯数字子分词器
32 | */
33 | class LetterSegmenter implements ISegmenter {
34 |
35 | //子分词器标签
36 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
37 | //链接符号
38 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' , '@' , '_'};
39 |
40 | //数字符号
41 | private static final char[] Num_Connector = new char[]{',' , '.'};
42 |
43 | /*
44 | * 词元的开始位置,
45 | * 同时作为子分词器状态标识
46 | * 当start > -1 时,标识当前的分词器正在处理字符
47 | */
48 | private int start;
49 | /*
50 | * 记录词元结束位置
51 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
52 | */
53 | private int end;
54 |
55 | /*
56 | * 字母起始位置
57 | */
58 | private int englishStart;
59 |
60 | /*
61 | * 字母结束位置
62 | */
63 | private int englishEnd;
64 |
65 | /*
66 | * 阿拉伯数字起始位置
67 | */
68 | private int arabicStart;
69 |
70 | /*
71 | * 阿拉伯数字结束位置
72 | */
73 | private int arabicEnd;
74 |
75 | LetterSegmenter(){
76 | Arrays.sort(Letter_Connector);
77 | Arrays.sort(Num_Connector);
78 | this.start = -1;
79 | this.end = -1;
80 | this.englishStart = -1;
81 | this.englishEnd = -1;
82 | this.arabicStart = -1;
83 | this.arabicEnd = -1;
84 | }
85 |
86 |
87 | /* (non-Javadoc)
88 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
89 | */
90 | public void analyze(AnalyzeContext context) {
91 | boolean bufferLockFlag = false;
92 | //处理英文字母
93 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
94 | //处理阿拉伯字母
95 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
96 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
97 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
98 |
99 | //判断是否锁定缓冲区
100 | if(bufferLockFlag){
101 | context.lockBuffer(SEGMENTER_NAME);
102 | }else{
103 | //对缓冲区解锁
104 | context.unlockBuffer(SEGMENTER_NAME);
105 | }
106 | }
107 |
108 | /* (non-Javadoc)
109 | * @see org.wltea.analyzer.core.ISegmenter#reset()
110 | */
111 | public void reset() {
112 | this.start = -1;
113 | this.end = -1;
114 | this.englishStart = -1;
115 | this.englishEnd = -1;
116 | this.arabicStart = -1;
117 | this.arabicEnd = -1;
118 | }
119 |
120 | /**
121 | * 处理数字字母混合输出
122 | * 如:windos2000 | linliangyi2005@gmail.com
123 | // * @param input
124 | * @param context
125 | * @return
126 | */
127 | private boolean processMixLetter(AnalyzeContext context){
128 | boolean needLock = false;
129 |
130 | if(this.start == -1){//当前的分词器尚未开始处理字符
131 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
132 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
133 | //记录起始指针的位置,标明分词器进入处理状态
134 | this.start = context.getCursor();
135 | this.end = start;
136 | }
137 |
138 | }else{//当前的分词器正在处理字符
139 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
140 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
141 | //记录下可能的结束位置
142 | this.end = context.getCursor();
143 |
144 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
145 | && this.isLetterConnector(context.getCurrentChar())){
146 | //记录下可能的结束位置
147 | this.end = context.getCursor();
148 | }else{
149 | //遇到非Letter字符,输出词元
150 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
151 | context.addLexeme(newLexeme);
152 | this.start = -1;
153 | this.end = -1;
154 | }
155 | }
156 |
157 | //判断缓冲区是否已经读完
158 | if(context.isBufferConsumed() && (this.start != -1 && this.end != -1)){
159 | //缓冲以读完,输出词元
160 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
161 | context.addLexeme(newLexeme);
162 | this.start = -1;
163 | this.end = -1;
164 | }
165 |
166 | //判断是否锁定缓冲区
167 | if(this.start == -1 && this.end == -1){
168 | //对缓冲区解锁
169 | needLock = false;
170 | }else{
171 | needLock = true;
172 | }
173 | return needLock;
174 | }
175 |
176 | /**
177 | * 处理纯英文字母输出
178 | * @param context
179 | * @return
180 | */
181 | private boolean processEnglishLetter(AnalyzeContext context){
182 | boolean needLock = false;
183 |
184 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符
185 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
186 | //记录起始指针的位置,标明分词器进入处理状态
187 | this.englishStart = context.getCursor();
188 | this.englishEnd = this.englishStart;
189 | }
190 | }else {//当前的分词器正在处理英文字符
191 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
192 | //记录当前指针位置为结束位置
193 | this.englishEnd = context.getCursor();
194 | }else{
195 | //遇到非English字符,输出词元
196 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
197 | context.addLexeme(newLexeme);
198 | this.englishStart = -1;
199 | this.englishEnd= -1;
200 | }
201 | }
202 |
203 | //判断缓冲区是否已经读完
204 | if(context.isBufferConsumed() && (this.englishStart != -1 && this.englishEnd != -1)){
205 | //缓冲以读完,输出词元
206 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
207 | context.addLexeme(newLexeme);
208 | this.englishStart = -1;
209 | this.englishEnd= -1;
210 | }
211 |
212 | //判断是否锁定缓冲区
213 | if(this.englishStart == -1 && this.englishEnd == -1){
214 | //对缓冲区解锁
215 | needLock = false;
216 | }else{
217 | needLock = true;
218 | }
219 | return needLock;
220 | }
221 |
222 | /**
223 | * 处理阿拉伯数字输出
224 | * @param context
225 | * @return
226 | */
227 | private boolean processArabicLetter(AnalyzeContext context){
228 | boolean needLock = false;
229 |
230 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符
231 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
232 | //记录起始指针的位置,标明分词器进入处理状态
233 | this.arabicStart = context.getCursor();
234 | this.arabicEnd = this.arabicStart;
235 | }
236 | }else {//当前的分词器正在处理数字字符
237 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
238 | //记录当前指针位置为结束位置
239 | this.arabicEnd = context.getCursor();
240 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
241 | && this.isNumConnector(context.getCurrentChar())){
242 | //不输出数字,但不标记结束
243 | }else{
244 | ////遇到非Arabic字符,输出词元
245 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
246 | context.addLexeme(newLexeme);
247 | this.arabicStart = -1;
248 | this.arabicEnd = -1;
249 | }
250 | }
251 |
252 | //判断缓冲区是否已经读完
253 | if(context.isBufferConsumed() && (this.arabicStart != -1 && this.arabicEnd != -1)){
254 | //生成已切分的词元
255 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
256 | context.addLexeme(newLexeme);
257 | this.arabicStart = -1;
258 | this.arabicEnd = -1;
259 | }
260 |
261 | //判断是否锁定缓冲区
262 | if(this.arabicStart == -1 && this.arabicEnd == -1){
263 | //对缓冲区解锁
264 | needLock = false;
265 | }else{
266 | needLock = true;
267 | }
268 | return needLock;
269 | }
270 |
271 | /**
272 | * 判断是否是字母连接符号
273 | * @param input
274 | * @return
275 | */
276 | private boolean isLetterConnector(char input){
277 | int index = Arrays.binarySearch(Letter_Connector, input);
278 | return index >= 0;
279 | }
280 |
281 | /**
282 | * 判断是否是数字连接符号
283 | * @param input
284 | * @return
285 | */
286 | private boolean isNumConnector(char input){
287 | int index = Arrays.binarySearch(Num_Connector, input);
288 | return index >= 0;
289 | }
290 | }
291 |
--------------------------------------------------------------------------------
/licenses/lucene-NOTICE.txt:
--------------------------------------------------------------------------------
1 | Apache Lucene
2 | Copyright 2014 The Apache Software Foundation
3 |
4 | This product includes software developed at
5 | The Apache Software Foundation (http://www.apache.org/).
6 |
7 | Includes software from other Apache Software Foundation projects,
8 | including, but not limited to:
9 | - Apache Ant
10 | - Apache Jakarta Regexp
11 | - Apache Commons
12 | - Apache Xerces
13 |
14 | ICU4J, (under analysis/icu) is licensed under an MIT styles license
15 | and Copyright (c) 1995-2008 International Business Machines Corporation and others
16 |
17 | Some data files (under analysis/icu/src/data) are derived from Unicode data such
18 | as the Unicode Character Database. See http://unicode.org/copyright.html for more
19 | details.
20 |
21 | Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
22 | BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
23 |
24 | The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
25 | automatically generated with the moman/finenight FSA library, created by
26 | Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
27 | see http://sites.google.com/site/rrettesite/moman and
28 | http://bitbucket.org/jpbarrette/moman/overview/
29 |
30 | The class org.apache.lucene.util.WeakIdentityMap was derived from
31 | the Apache CXF project and is Apache License 2.0.
32 |
33 | The Google Code Prettify is Apache License 2.0.
34 | See http://code.google.com/p/google-code-prettify/
35 |
36 | JUnit (junit-4.10) is licensed under the Common Public License v. 1.0
37 | See http://junit.sourceforge.net/cpl-v10.html
38 |
39 | This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin
40 | g Package (jaspell): http://jaspell.sourceforge.net/
41 | License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)
42 |
43 | The snowball stemmers in
44 | analysis/common/src/java/net/sf/snowball
45 | were developed by Martin Porter and Richard Boulton.
46 | The snowball stopword lists in
47 | analysis/common/src/resources/org/apache/lucene/analysis/snowball
48 | were developed by Martin Porter and Richard Boulton.
49 | The full snowball package is available from
50 | http://snowball.tartarus.org/
51 |
52 | The KStem stemmer in
53 | analysis/common/src/org/apache/lucene/analysis/en
54 | was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
55 | under the BSD-license.
56 |
57 | The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
58 | stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
59 | analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
60 | analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
61 | analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
62 | analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
63 | analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
64 | See http://members.unine.ch/jacques.savoy/clef/index.html.
65 |
66 | The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
67 | (common) are based on BSD-licensed reference implementations created by Jacques Savoy and
68 | Ljiljana Dolamic. These files reside in:
69 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
70 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
71 | analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
72 | analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
73 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
74 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
75 | analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
76 | analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
77 | analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
78 | analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
79 | analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
80 |
81 | The Stempel analyzer (stempel) includes BSD-licensed software developed
82 | by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
83 | and Edmond Nolan.
84 |
85 | The Polish analyzer (stempel) comes with a default
86 | stopword list that is BSD-licensed created by the Carrot2 project. The file resides
87 | in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
88 | See http://project.carrot2.org/license.html.
89 |
90 | The SmartChineseAnalyzer source code (smartcn) was
91 | provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
92 |
93 | WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
94 | is derived from Unicode data such as the Unicode Character Database.
95 | See http://unicode.org/copyright.html for more details.
96 |
97 | The Morfologik analyzer (morfologik) includes BSD-licensed software
98 | developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
99 |
100 | Morfologik uses data from Polish ispell/myspell dictionary
101 | (http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
102 | LGPL and Creative Commons ShareAlike.
103 |
104 | Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
105 | (http://sgjp.pl/morfeusz/)
106 |
107 | Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original
108 | source code for this can be found at http://www.eclipse.org/jetty/downloads.php
109 |
110 | ===========================================================================
111 | Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
112 | ===========================================================================
113 |
114 | This software includes a binary and/or source version of data from
115 |
116 | mecab-ipadic-2.7.0-20070801
117 |
118 | which can be obtained from
119 |
120 | http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
121 |
122 | or
123 |
124 | http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
125 |
126 | ===========================================================================
127 | mecab-ipadic-2.7.0-20070801 Notice
128 | ===========================================================================
129 |
130 | Nara Institute of Science and Technology (NAIST),
131 | the copyright holders, disclaims all warranties with regard to this
132 | software, including all implied warranties of merchantability and
133 | fitness, in no event shall NAIST be liable for
134 | any special, indirect or consequential damages or any damages
135 | whatsoever resulting from loss of use, data or profits, whether in an
136 | action of contract, negligence or other tortuous action, arising out
137 | of or in connection with the use or performance of this software.
138 |
139 | A large portion of the dictionary entries
140 | originate from ICOT Free Software. The following conditions for ICOT
141 | Free Software applies to the current dictionary as well.
142 |
143 | Each User may also freely distribute the Program, whether in its
144 | original form or modified, to any third party or parties, PROVIDED
145 | that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
146 | on, or be attached to, the Program, which is distributed substantially
147 | in the same form as set out herein and that such intended
148 | distribution, if actually made, will neither violate or otherwise
149 | contravene any of the laws and regulations of the countries having
150 | jurisdiction over the User or the intended distribution itself.
151 |
152 | NO WARRANTY
153 |
154 | The program was produced on an experimental basis in the course of the
155 | research and development conducted during the project and is provided
156 | to users as so produced on an experimental basis. Accordingly, the
157 | program is provided without any warranty whatsoever, whether express,
158 | implied, statutory or otherwise. The term "warranty" used herein
159 | includes, but is not limited to, any warranty of the quality,
160 | performance, merchantability and fitness for a particular purpose of
161 | the program and the nonexistence of any infringement or violation of
162 | any right of any third party.
163 |
164 | Each user of the program will agree and understand, and be deemed to
165 | have agreed and understood, that there is no warranty whatsoever for
166 | the program and, accordingly, the entire risk arising from or
167 | otherwise connected with the program is assumed by the user.
168 |
169 | Therefore, neither ICOT, the copyright holder, or any other
170 | organization that participated in or was otherwise related to the
171 | development of the program and their respective officials, directors,
172 | officers and other employees shall be held liable for any and all
173 | damages, including, without limitation, general, special, incidental
174 | and consequential damages, arising out of or otherwise in connection
175 | with the use or inability to use the program or any product, material
176 | or result produced or otherwise obtained by using the program,
177 | regardless of whether they have been advised of, or otherwise had
178 | knowledge of, the possibility of such damages at any time during the
179 | project or thereafter. Each user will be deemed to have agreed to the
180 | foregoing by his or her commencement of use of the program. The term
181 | "use" as used herein includes, but is not limited to, the use,
182 | modification, copying and distribution of the program and the
183 | production of secondary products from the program.
184 |
185 | In the case where the program, whether in its original form or
186 | modified, was distributed or delivered to or received by a user from
187 | any person, organization or entity other than ICOT, unless it makes or
188 | grants independently of ICOT any specific warranty to the user in
189 | writing, such person, organization or entity, will also be exempted
190 | from and not be held liable to the user for any such damages as noted
191 | above as far as the program is concerned.
192 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.io.IOException;
28 | import java.io.Reader;
29 | import java.util.HashMap;
30 | import java.util.HashSet;
31 | import java.util.LinkedList;
32 | import java.util.Map;
33 | import java.util.Set;
34 |
35 | import org.wltea.analyzer.cfg.Configuration;
36 | import org.wltea.analyzer.dic.Dictionary;
37 |
38 | /**
39 | *
40 | * 分词器上下文状态
41 | *
42 | */
43 | class AnalyzeContext {
44 |
45 | //默认缓冲区大小
46 | private static final int BUFF_SIZE = 4096;
47 | //缓冲区耗尽的临界值
48 | private static final int BUFF_EXHAUST_CRITICAL = 100;
49 |
50 |
51 | //字符串读取缓冲
52 | private char[] segmentBuff;
53 | //字符类型数组
54 | private int[] charTypes;
55 |
56 |
57 | //记录Reader内已分析的字串总长度
58 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
59 | private int buffOffset;
60 | //当前缓冲区位置指针
61 | private int cursor;
62 | //最近一次读入的,可处理的字串长度
63 | private int available;
64 |
65 |
66 | //子分词器锁
67 | //该集合非空,说明有子分词器在占用segmentBuff
68 | private Set buffLocker;
69 |
70 | //原始分词结果集合,未经歧义处理
71 | private QuickSortSet orgLexemes;
72 | //LexemePath位置索引表
73 | private Map pathMap;
74 | //最终分词结果集
75 | private LinkedList results;
76 | //分词器配置项
77 | private Configuration cfg;
78 |
79 | public AnalyzeContext(Configuration configuration){
80 | this.cfg = configuration;
81 | this.segmentBuff = new char[BUFF_SIZE];
82 | this.charTypes = new int[BUFF_SIZE];
83 | this.buffLocker = new HashSet();
84 | this.orgLexemes = new QuickSortSet();
85 | this.pathMap = new HashMap();
86 | this.results = new LinkedList();
87 | }
88 |
89 | int getCursor(){
90 | return this.cursor;
91 | }
92 |
93 | char[] getSegmentBuff(){
94 | return this.segmentBuff;
95 | }
96 |
97 | char getCurrentChar(){
98 | return this.segmentBuff[this.cursor];
99 | }
100 |
101 | int getCurrentCharType(){
102 | return this.charTypes[this.cursor];
103 | }
104 |
105 | int getBufferOffset(){
106 | return this.buffOffset;
107 | }
108 |
109 | /**
110 | * 根据context的上下文情况,填充segmentBuff
111 | * @param reader
112 | * @return 返回待分析的(有效的)字串长度
113 | * @throws java.io.IOException
114 | */
115 | int fillBuffer(Reader reader) throws IOException{
116 | int readCount = 0;
117 | if(this.buffOffset == 0){
118 | //首次读取reader
119 | readCount = reader.read(segmentBuff);
120 | }else{
121 | int offset = this.available - this.cursor;
122 | if(offset > 0){
123 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
124 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
125 | readCount = offset;
126 | }
127 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
128 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
129 | }
130 | //记录最后一次从Reader中读入的可用字符长度
131 | this.available = readCount;
132 | //重置当前指针
133 | this.cursor = 0;
134 | return readCount;
135 | }
136 |
137 | /**
138 | * 初始化buff指针,处理第一个字符
139 | */
140 | void initCursor(){
141 | this.cursor = 0;
142 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
143 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
144 | }
145 |
146 | /**
147 | * 指针+1
148 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false
149 | * 并处理当前字符
150 | */
151 | boolean moveCursor(){
152 | if(this.cursor < this.available - 1){
153 | this.cursor++;
154 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
155 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
156 | return true;
157 | }else{
158 | return false;
159 | }
160 | }
161 |
162 | /**
163 | * 设置当前segmentBuff为锁定状态
164 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
165 | * @param segmenterName
166 | */
167 | void lockBuffer(String segmenterName){
168 | this.buffLocker.add(segmenterName);
169 | }
170 |
171 | /**
172 | * 移除指定的子分词器名,释放对segmentBuff的占用
173 | * @param segmenterName
174 | */
175 | void unlockBuffer(String segmenterName){
176 | this.buffLocker.remove(segmenterName);
177 | }
178 |
179 | /**
180 | * 只要buffLocker中存在segmenterName
181 | * 则buffer被锁定
182 | * @return boolean 缓冲去是否被锁定
183 | */
184 | boolean isBufferLocked(){
185 | return this.buffLocker.size() > 0;
186 | }
187 |
188 | /**
189 | * 判断当前segmentBuff是否已经用完
190 | * 当前执针cursor移至segmentBuff末端this.available - 1
191 | * @return
192 | */
193 | boolean isBufferConsumed(){
194 | return this.cursor == this.available - 1;
195 | }
196 |
197 | /**
198 | * 判断segmentBuff是否需要读取新数据
199 | *
200 | * 满足一下条件时,
201 | * 1.available == BUFF_SIZE 表示buffer满载
202 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内
203 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer
204 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作)
205 | * @return
206 | */
207 | boolean needRefillBuffer(){
208 | return this.available == BUFF_SIZE
209 | && this.cursor < this.available - 1
210 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
211 | && !this.isBufferLocked();
212 | }
213 |
214 | /**
215 | * 累计当前的segmentBuff相对于reader起始位置的位移
216 | */
217 | void markBufferOffset(){
218 | this.buffOffset += this.cursor;
219 | }
220 |
221 | /**
222 | * 向分词结果集添加词元
223 | * @param lexeme
224 | */
225 | void addLexeme(Lexeme lexeme){
226 | this.orgLexemes.addLexeme(lexeme);
227 | }
228 |
229 | /**
230 | * 添加分词结果路径
231 | * 路径起始位置 ---> 路径 映射表
232 | * @param path
233 | */
234 | void addLexemePath(LexemePath path){
235 | if(path != null){
236 | this.pathMap.put(path.getPathBegin(), path);
237 | }
238 | }
239 |
240 |
241 | /**
242 | * 返回原始分词结果
243 | * @return
244 | */
245 | QuickSortSet getOrgLexemes(){
246 | return this.orgLexemes;
247 | }
248 |
249 | /**
250 | * 推送分词结果到结果集合
251 | * 1.从buff头部遍历到this.cursor已处理位置
252 | * 2.将map中存在的分词结果推入results
253 | * 3.将map中不存在的CJDK字符以单字方式推入results
254 | */
255 | void outputToResult(){
256 | int index = 0;
257 | for( ; index <= this.cursor ;){
258 | //跳过非CJK字符
259 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
260 | index++;
261 | continue;
262 | }
263 | //从pathMap找出对应index位置的LexemePath
264 | LexemePath path = this.pathMap.get(index);
265 | if(path != null){
266 | //输出LexemePath中的lexeme到results集合
267 | Lexeme l = path.pollFirst();
268 | while(l != null){
269 | this.results.add(l);
270 | //字典中无单字,但是词元冲突了,切分出相交词元的前一个词元中的单字
271 | /*int innerIndex = index + 1;
272 | for (; innerIndex < index + l.getLength(); innerIndex++) {
273 | Lexeme innerL = path.peekFirst();
274 | if (innerL != null && innerIndex == innerL.getBegin()) {
275 | this.outputSingleCJK(innerIndex - 1);
276 | }
277 | }*/
278 |
279 | //将index移至lexeme后
280 | index = l.getBegin() + l.getLength();
281 | l = path.pollFirst();
282 | if(l != null){
283 | //输出path内部,词元间遗漏的单字
284 | for(;index < l.getBegin();index++){
285 | this.outputSingleCJK(index);
286 | }
287 | }
288 | }
289 | }else{//pathMap中找不到index对应的LexemePath
290 | //单字输出
291 | this.outputSingleCJK(index);
292 | index++;
293 | }
294 | }
295 | //清空当前的Map
296 | this.pathMap.clear();
297 | }
298 |
299 | /**
300 | * 对CJK字符进行单字输出
301 | * @param index
302 | */
303 | private void outputSingleCJK(int index){
304 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){
305 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR);
306 | this.results.add(singleCharLexeme);
307 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){
308 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK);
309 | this.results.add(singleCharLexeme);
310 | }
311 | }
312 |
313 | /**
314 | * 返回lexeme
315 | *
316 | * 同时处理合并
317 | * @return
318 | */
319 | Lexeme getNextLexeme(){
320 | //从结果集取出,并移除第一个Lexme
321 | Lexeme result = this.results.pollFirst();
322 | while(result != null){
323 | //数量词合并
324 | this.compound(result);
325 | if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
326 | //是停止词继续取列表的下一个
327 | result = this.results.pollFirst();
328 | }else{
329 | //不是停止词, 生成lexeme的词元文本,输出
330 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
331 | break;
332 | }
333 | }
334 | return result;
335 | }
336 |
337 | /**
338 | * 重置分词上下文状态
339 | */
340 | void reset(){
341 | this.buffLocker.clear();
342 | this.orgLexemes = new QuickSortSet();
343 | this.available =0;
344 | this.buffOffset = 0;
345 | this.charTypes = new int[BUFF_SIZE];
346 | this.cursor = 0;
347 | this.results.clear();
348 | this.segmentBuff = new char[BUFF_SIZE];
349 | this.pathMap.clear();
350 | }
351 |
352 | /**
353 | * 组合词元
354 | */
355 | private void compound(Lexeme result){
356 |
357 | if(!this.cfg.isUseSmart()){
358 | return ;
359 | }
360 | //数量词合并处理
361 | if(!this.results.isEmpty()){
362 |
363 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
364 | Lexeme nextLexeme = this.results.peekFirst();
365 | boolean appendOk = false;
366 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){
367 | //合并英文数词+中文数词
368 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
369 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
370 | //合并英文数词+中文量词
371 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
372 | }
373 | if(appendOk){
374 | //弹出
375 | this.results.pollFirst();
376 | }
377 | }
378 |
379 | //可能存在第二轮合并
380 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
381 | Lexeme nextLexeme = this.results.peekFirst();
382 | boolean appendOk = false;
383 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
384 | //合并中文数词+中文量词
385 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
386 | }
387 | if(appendOk){
388 | //弹出
389 | this.results.pollFirst();
390 | }
391 | }
392 |
393 | }
394 | }
395 |
396 | }
397 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | elasticsearch-analysis-ik
6 | 4.0.0
7 | org.elasticsearch
8 | elasticsearch-analysis-ik
9 | ${elasticsearch.version}
10 | jar
11 | IK Analyzer for Elasticsearch
12 | 2011
13 |
14 |
15 | 8.3.3
16 | 1.8
17 | ${project.basedir}/src/main/assemblies/plugin.xml
18 | analysis-ik
19 | org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin
20 | true
21 | false
22 | true
23 | 4E899B30
24 | true
25 |
26 |
27 |
28 |
29 | The Apache Software License, Version 2.0
30 | http://www.apache.org/licenses/LICENSE-2.0.txt
31 | repo
32 |
33 |
34 |
35 |
36 |
37 | Medcl
38 | medcl@elastic.co
39 | elastic
40 | http://www.elastic.co
41 |
42 |
43 |
44 |
45 | scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git
46 | scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git
47 |
48 | http://github.com/medcl/elasticsearch-analysis-ik
49 |
50 |
51 |
52 | org.sonatype.oss
53 | oss-parent
54 | 9
55 |
56 |
57 |
58 |
59 | oss.sonatype.org
60 | https://oss.sonatype.org/content/repositories/snapshots
61 |
62 |
63 | oss.sonatype.org
64 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
65 |
66 |
67 |
68 |
69 |
70 | oss.sonatype.org
71 | OSS Sonatype
72 | true
73 | true
74 | https://oss.sonatype.org/content/repositories/releases/
75 |
76 |
77 |
78 |
79 |
80 | org.elasticsearch
81 | elasticsearch
82 | ${elasticsearch.version}
83 | compile
84 |
85 |
86 |
87 |
88 | org.apache.httpcomponents
89 | httpclient
90 | 4.5.2
91 |
92 |
93 |
94 | org.apache.logging.log4j
95 | log4j-api
96 | 2.17.1
97 |
98 |
99 |
100 | org.hamcrest
101 | hamcrest-core
102 | 1.3
103 | test
104 |
105 |
106 |
107 | org.hamcrest
108 | hamcrest-library
109 | 1.3
110 | test
111 |
112 |
113 | junit
114 | junit
115 | 4.12
116 | test
117 |
118 |
119 |
120 |
121 |
122 |
123 | org.apache.maven.plugins
124 | maven-compiler-plugin
125 | 3.5.1
126 |
127 | ${maven.compiler.target}
128 | ${maven.compiler.target}
129 |
130 |
131 |
132 | org.apache.maven.plugins
133 | maven-surefire-plugin
134 | 2.11
135 |
136 |
137 | **/*Tests.java
138 |
139 |
140 |
141 |
142 | org.apache.maven.plugins
143 | maven-source-plugin
144 | 2.1.2
145 |
146 |
147 | attach-sources
148 |
149 | jar
150 |
151 |
152 |
153 |
154 |
155 | maven-assembly-plugin
156 |
157 |
158 | false
159 | ${project.build.directory}/releases/
160 |
161 | ${basedir}/src/main/assemblies/plugin.xml
162 |
163 |
164 |
165 | fully.qualified.MainClass
166 |
167 |
168 |
169 |
170 |
171 | package
172 |
173 | single
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 | disable-java8-doclint
183 |
184 | [1.8,)
185 |
186 |
187 | -Xdoclint:none
188 |
189 |
190 |
191 | release
192 |
193 |
194 |
195 | org.sonatype.plugins
196 | nexus-staging-maven-plugin
197 | 1.6.3
198 | true
199 |
200 | oss
201 | https://oss.sonatype.org/
202 | true
203 |
204 |
205 |
206 | org.apache.maven.plugins
207 | maven-release-plugin
208 | 2.1
209 |
210 | true
211 | false
212 | release
213 | deploy
214 |
215 |
216 |
217 | org.apache.maven.plugins
218 | maven-compiler-plugin
219 | 3.5.1
220 |
221 | ${maven.compiler.target}
222 | ${maven.compiler.target}
223 |
224 |
225 |
226 | org.apache.maven.plugins
227 | maven-gpg-plugin
228 | 1.5
229 |
230 |
231 | sign-artifacts
232 | verify
233 |
234 | sign
235 |
236 |
237 |
238 |
239 |
240 | org.apache.maven.plugins
241 | maven-source-plugin
242 | 2.2.1
243 |
244 |
245 | attach-sources
246 |
247 | jar-no-fork
248 |
249 |
250 |
251 |
252 |
253 | org.apache.maven.plugins
254 | maven-javadoc-plugin
255 | 2.9
256 |
257 |
258 | attach-javadocs
259 |
260 | jar
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Dictionary.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.io.BufferedReader;
29 | import java.io.FileInputStream;
30 | import java.io.FileNotFoundException;
31 | import java.io.IOException;
32 | import java.io.InputStream;
33 | import java.io.InputStreamReader;
34 | import java.nio.file.attribute.BasicFileAttributes;
35 | import java.nio.file.Files;
36 | import java.nio.file.FileVisitResult;
37 | import java.nio.file.Path;
38 | import java.nio.file.SimpleFileVisitor;
39 | import java.security.AccessController;
40 | import java.security.PrivilegedAction;
41 | import java.util.*;
42 | import java.util.concurrent.Executors;
43 | import java.util.concurrent.ScheduledExecutorService;
44 | import java.util.concurrent.TimeUnit;
45 |
46 | import org.apache.http.Header;
47 | import org.apache.http.HttpEntity;
48 | import org.apache.http.client.ClientProtocolException;
49 | import org.apache.http.client.config.RequestConfig;
50 | import org.apache.http.client.methods.CloseableHttpResponse;
51 | import org.apache.http.client.methods.HttpGet;
52 | import org.apache.http.impl.client.CloseableHttpClient;
53 | import org.apache.http.impl.client.HttpClients;
54 | import org.elasticsearch.SpecialPermission;
55 | import org.elasticsearch.core.PathUtils;
56 | import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
57 | import org.wltea.analyzer.cfg.Configuration;
58 | import org.apache.logging.log4j.Logger;
59 | import org.wltea.analyzer.help.ESPluginLoggerFactory;
60 |
61 |
62 | /**
63 | * 词典管理类,单子模式
64 | */
65 | public class Dictionary {
66 |
67 | /*
68 | * 词典单子实例
69 | */
70 | private static Dictionary singleton;
71 |
72 | private DictSegment _MainDict;
73 |
74 | private DictSegment _QuantifierDict;
75 |
76 | private DictSegment _StopWords;
77 |
78 | /**
79 | * 配置对象
80 | */
81 | private Configuration configuration;
82 |
83 | private static final Logger logger = ESPluginLoggerFactory.getLogger(Dictionary.class.getName());
84 |
85 | private static ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);
86 |
87 | private static final String PATH_DIC_MAIN = "main.dic";
88 | private static final String PATH_DIC_SURNAME = "surname.dic";
89 | private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
90 | private static final String PATH_DIC_SUFFIX = "suffix.dic";
91 | private static final String PATH_DIC_PREP = "preposition.dic";
92 | private static final String PATH_DIC_STOP = "stopword.dic";
93 |
94 | private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
95 | private final static String EXT_DICT = "ext_dict";
96 | private final static String REMOTE_EXT_DICT = "remote_ext_dict";
97 | private final static String EXT_STOP = "ext_stopwords";
98 | private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
99 |
100 | private Path conf_dir;
101 | private Properties props;
102 |
	/**
	 * Loads IKAnalyzer.cfg.xml into {@link #props}: first from the
	 * Elasticsearch config dir under the plugin name, then falling back to the
	 * config directory shipped inside the plugin.
	 */
	private Dictionary(Configuration cfg) {
		this.configuration = cfg;
		this.props = new Properties();
		this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
		Path configFile = conf_dir.resolve(FILE_NAME);

		InputStream input = null;
		try {
			logger.info("try load config from {}", configFile);
			input = new FileInputStream(configFile.toFile());
		} catch (FileNotFoundException e) {
			// Fall back to the config bundled in the plugin directory.
			conf_dir = cfg.getConfigInPluginDir();
			configFile = conf_dir.resolve(FILE_NAME);
			try {
				logger.info("try load config from {}", configFile);
				input = new FileInputStream(configFile.toFile());
			} catch (FileNotFoundException ex) {
				// We should report origin exception
				logger.error("ik-analyzer", e);
			}
		}
		if (input != null) {
			try {
				// Properties.loadFromXML closes the stream when it returns.
				props.loadFromXML(input);
			} catch (IOException e) {
				logger.error("ik-analyzer", e);
			}
		}
	}
132 |
133 | private String getProperty(String key){
134 | if(props!=null){
135 | return props.getProperty(key);
136 | }
137 | return null;
138 | }
139 | /**
140 | * 词典初始化 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化
141 | * 只有当Dictionary类被实际调用时,才会开始载入词典, 这将延长首次分词操作的时间 该方法提供了一个在应用加载阶段就初始化字典的手段
142 | *
143 | * @return Dictionary
144 | */
145 | public static synchronized void initial(Configuration cfg) {
146 | if (singleton == null) {
147 | synchronized (Dictionary.class) {
148 | if (singleton == null) {
149 |
150 | singleton = new Dictionary(cfg);
151 | singleton.loadMainDict();
152 | singleton.loadSurnameDict();
153 | singleton.loadQuantifierDict();
154 | singleton.loadSuffixDict();
155 | singleton.loadPrepDict();
156 | singleton.loadStopWordDict();
157 |
158 | if(cfg.isEnableRemoteDict()){
159 | // 建立监控线程
160 | for (String location : singleton.getRemoteExtDictionarys()) {
161 | // 10 秒是初始延迟可以修改的 60是间隔时间 单位秒
162 | pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
163 | }
164 | for (String location : singleton.getRemoteExtStopWordDictionarys()) {
165 | pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
166 | }
167 | }
168 |
169 | }
170 | }
171 | }
172 | }
173 |
174 | private void walkFileTree(List files, Path path) {
175 | if (Files.isRegularFile(path)) {
176 | files.add(path.toString());
177 | } else if (Files.isDirectory(path)) try {
178 | Files.walkFileTree(path, new SimpleFileVisitor() {
179 | @Override
180 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) {
181 | files.add(file.toString());
182 | return FileVisitResult.CONTINUE;
183 | }
184 | @Override
185 | public FileVisitResult visitFileFailed(Path file, IOException e) {
186 | logger.error("[Ext Loading] listing files", e);
187 | return FileVisitResult.CONTINUE;
188 | }
189 | });
190 | } catch (IOException e) {
191 | logger.error("[Ext Loading] listing files", e);
192 | } else {
193 | logger.warn("[Ext Loading] file not found: " + path);
194 | }
195 | }
196 |
197 | private void loadDictFile(DictSegment dict, Path file, boolean critical, String name) {
198 | try (InputStream is = new FileInputStream(file.toFile())) {
199 | BufferedReader br = new BufferedReader(
200 | new InputStreamReader(is, "UTF-8"), 512);
201 | String word = br.readLine();
202 | if (word != null) {
203 | if (word.startsWith("\uFEFF"))
204 | word = word.substring(1);
205 | for (; word != null; word = br.readLine()) {
206 | word = word.trim();
207 | if (word.isEmpty()) continue;
208 | dict.fillSegment(word.toCharArray());
209 | }
210 | }
211 | } catch (FileNotFoundException e) {
212 | logger.error("ik-analyzer: " + name + " not found", e);
213 | if (critical) throw new RuntimeException("ik-analyzer: " + name + " not found!!!", e);
214 | } catch (IOException e) {
215 | logger.error("ik-analyzer: " + name + " loading failed", e);
216 | }
217 | }
218 |
219 | private List getExtDictionarys() {
220 | List extDictFiles = new ArrayList(2);
221 | String extDictCfg = getProperty(EXT_DICT);
222 | if (extDictCfg != null) {
223 |
224 | String[] filePaths = extDictCfg.split(";");
225 | for (String filePath : filePaths) {
226 | if (filePath != null && !"".equals(filePath.trim())) {
227 | Path file = PathUtils.get(getDictRoot(), filePath.trim());
228 | walkFileTree(extDictFiles, file);
229 |
230 | }
231 | }
232 | }
233 | return extDictFiles;
234 | }
235 |
236 | private List getRemoteExtDictionarys() {
237 | List remoteExtDictFiles = new ArrayList(2);
238 | String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
239 | if (remoteExtDictCfg != null) {
240 |
241 | String[] filePaths = remoteExtDictCfg.split(";");
242 | for (String filePath : filePaths) {
243 | if (filePath != null && !"".equals(filePath.trim())) {
244 | remoteExtDictFiles.add(filePath);
245 |
246 | }
247 | }
248 | }
249 | return remoteExtDictFiles;
250 | }
251 |
252 | private List getExtStopWordDictionarys() {
253 | List extStopWordDictFiles = new ArrayList(2);
254 | String extStopWordDictCfg = getProperty(EXT_STOP);
255 | if (extStopWordDictCfg != null) {
256 |
257 | String[] filePaths = extStopWordDictCfg.split(";");
258 | for (String filePath : filePaths) {
259 | if (filePath != null && !"".equals(filePath.trim())) {
260 | Path file = PathUtils.get(getDictRoot(), filePath.trim());
261 | walkFileTree(extStopWordDictFiles, file);
262 |
263 | }
264 | }
265 | }
266 | return extStopWordDictFiles;
267 | }
268 |
269 | private List getRemoteExtStopWordDictionarys() {
270 | List remoteExtStopWordDictFiles = new ArrayList(2);
271 | String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
272 | if (remoteExtStopWordDictCfg != null) {
273 |
274 | String[] filePaths = remoteExtStopWordDictCfg.split(";");
275 | for (String filePath : filePaths) {
276 | if (filePath != null && !"".equals(filePath.trim())) {
277 | remoteExtStopWordDictFiles.add(filePath);
278 |
279 | }
280 | }
281 | }
282 | return remoteExtStopWordDictFiles;
283 | }
284 |
	// Absolute path of the active configuration directory, used as the root
	// against which relative dictionary paths are resolved.
	private String getDictRoot() {
		return conf_dir.toAbsolutePath().toString();
	}
288 |
289 |
290 | /**
291 | * 获取词典单子实例
292 | *
293 | * @return Dictionary 单例对象
294 | */
295 | public static Dictionary getSingleton() {
296 | if (singleton == null) {
297 | throw new IllegalStateException("ik dict has not been initialized yet, please call initial method first.");
298 | }
299 | return singleton;
300 | }
301 |
302 |
303 | /**
304 | * 批量加载新词条
305 | *
306 | * @param words
307 | * Collection词条列表
308 | */
309 | public void addWords(Collection words) {
310 | if (words != null) {
311 | for (String word : words) {
312 | if (word != null) {
313 | // 批量加载词条到主内存词典中
314 | singleton._MainDict.fillSegment(word.trim().toCharArray());
315 | }
316 | }
317 | }
318 | }
319 |
320 | /**
321 | * 批量移除(屏蔽)词条
322 | */
323 | public void disableWords(Collection words) {
324 | if (words != null) {
325 | for (String word : words) {
326 | if (word != null) {
327 | // 批量屏蔽词条
328 | singleton._MainDict.disableSegment(word.trim().toCharArray());
329 | }
330 | }
331 | }
332 | }
333 |
334 | /**
335 | * 检索匹配主词典
336 | *
337 | * @return Hit 匹配结果描述
338 | */
339 | public Hit matchInMainDict(char[] charArray) {
340 | return singleton._MainDict.match(charArray);
341 | }
342 |
343 | /**
344 | * 检索匹配主词典
345 | *
346 | * @return Hit 匹配结果描述
347 | */
348 | public Hit matchInMainDict(char[] charArray, int begin, int length) {
349 | return singleton._MainDict.match(charArray, begin, length);
350 | }
351 |
352 | /**
353 | * 检索匹配量词词典
354 | *
355 | * @return Hit 匹配结果描述
356 | */
357 | public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
358 | return singleton._QuantifierDict.match(charArray, begin, length);
359 | }
360 |
361 | /**
362 | * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
363 | *
364 | * @return Hit
365 | */
366 | public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
367 | DictSegment ds = matchedHit.getMatchedDictSegment();
368 | return ds.match(charArray, currentIndex, 1, matchedHit);
369 | }
370 |
371 | /**
372 | * 判断是否是停止词
373 | *
374 | * @return boolean
375 | */
376 | public boolean isStopWord(char[] charArray, int begin, int length) {
377 | return singleton._StopWords.match(charArray, begin, length).isMatch();
378 | }
379 |
380 | /**
381 | * 加载主词典及扩展词典
382 | */
383 | private void loadMainDict() {
384 | // 建立一个主词典实例
385 | _MainDict = new DictSegment((char) 0);
386 |
387 | // 读取主词典文件
388 | Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
389 | loadDictFile(_MainDict, file, false, "Main Dict");
390 | // 加载扩展词典
391 | this.loadExtDict();
392 | // 加载远程自定义词库
393 | this.loadRemoteExtDict();
394 | }
395 |
396 | /**
397 | * 加载用户配置的扩展词典到主词库表
398 | */
399 | private void loadExtDict() {
400 | // 加载扩展词典配置
401 | List extDictFiles = getExtDictionarys();
402 | if (extDictFiles != null) {
403 | for (String extDictName : extDictFiles) {
404 | // 读取扩展词典文件
405 | logger.info("[Dict Loading] " + extDictName);
406 | Path file = PathUtils.get(extDictName);
407 | loadDictFile(_MainDict, file, false, "Extra Dict");
408 | }
409 | }
410 | }
411 |
412 | /**
413 | * 加载远程扩展词典到主词库表
414 | */
415 | private void loadRemoteExtDict() {
416 | List remoteExtDictFiles = getRemoteExtDictionarys();
417 | for (String location : remoteExtDictFiles) {
418 | logger.info("[Dict Loading] " + location);
419 | List lists = getRemoteWords(location);
420 | // 如果找不到扩展的字典,则忽略
421 | if (lists == null) {
422 | logger.error("[Dict Loading] " + location + " load failed");
423 | continue;
424 | }
425 | for (String theWord : lists) {
426 | if (theWord != null && !"".equals(theWord.trim())) {
427 | // 加载扩展词典数据到主内存词典中
428 | logger.info(theWord);
429 | _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
430 | }
431 | }
432 | }
433 |
434 | }
435 |
436 | private static List getRemoteWords(String location) {
437 | SpecialPermission.check();
438 | return AccessController.doPrivileged((PrivilegedAction>) () -> {
439 | return getRemoteWordsUnprivileged(location);
440 | });
441 | }
442 |
443 | /**
444 | * 从远程服务器上下载自定义词条
445 | */
446 | private static List getRemoteWordsUnprivileged(String location) {
447 |
448 | List buffer = new ArrayList();
449 | RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
450 | .setSocketTimeout(60 * 1000).build();
451 | CloseableHttpClient httpclient = HttpClients.createDefault();
452 | CloseableHttpResponse response;
453 | BufferedReader in;
454 | HttpGet get = new HttpGet(location);
455 | get.setConfig(rc);
456 | try {
457 | response = httpclient.execute(get);
458 | if (response.getStatusLine().getStatusCode() == 200) {
459 |
460 | String charset = "UTF-8";
461 | // 获取编码,默认为utf-8
462 | HttpEntity entity = response.getEntity();
463 | if(entity!=null){
464 | Header contentType = entity.getContentType();
465 | if(contentType!=null&&contentType.getValue()!=null){
466 | String typeValue = contentType.getValue();
467 | if(typeValue!=null&&typeValue.contains("charset=")){
468 | charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
469 | }
470 | }
471 |
472 | if (entity.getContentLength() > 0 || entity.isChunked()) {
473 | in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
474 | String line;
475 | while ((line = in.readLine()) != null) {
476 | buffer.add(line);
477 | }
478 | in.close();
479 | response.close();
480 | return buffer;
481 | }
482 | }
483 | }
484 | response.close();
485 | } catch (IllegalStateException | IOException e) {
486 | logger.error("getRemoteWords {} error", e, location);
487 | }
488 | return buffer;
489 | }
490 |
491 | /**
492 | * 加载用户扩展的停止词词典
493 | */
494 | private void loadStopWordDict() {
495 | // 建立主词典实例
496 | _StopWords = new DictSegment((char) 0);
497 |
498 | // 读取主词典文件
499 | Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
500 | loadDictFile(_StopWords, file, false, "Main Stopwords");
501 |
502 | // 加载扩展停止词典
503 | List extStopWordDictFiles = getExtStopWordDictionarys();
504 | if (extStopWordDictFiles != null) {
505 | for (String extStopWordDictName : extStopWordDictFiles) {
506 | logger.info("[Dict Loading] " + extStopWordDictName);
507 |
508 | // 读取扩展词典文件
509 | file = PathUtils.get(extStopWordDictName);
510 | loadDictFile(_StopWords, file, false, "Extra Stopwords");
511 | }
512 | }
513 |
514 | // 加载远程停用词典
515 | List remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
516 | for (String location : remoteExtStopWordDictFiles) {
517 | logger.info("[Dict Loading] " + location);
518 | List lists = getRemoteWords(location);
519 | // 如果找不到扩展的字典,则忽略
520 | if (lists == null) {
521 | logger.error("[Dict Loading] " + location + " load failed");
522 | continue;
523 | }
524 | for (String theWord : lists) {
525 | if (theWord != null && !"".equals(theWord.trim())) {
526 | // 加载远程词典数据到主内存中
527 | logger.info(theWord);
528 | _StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
529 | }
530 | }
531 | }
532 |
533 | }
534 |
535 | /**
536 | * 加载量词词典
537 | */
538 | private void loadQuantifierDict() {
539 | // 建立一个量词典实例
540 | _QuantifierDict = new DictSegment((char) 0);
541 | // 读取量词词典文件
542 | Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
543 | loadDictFile(_QuantifierDict, file, false, "Quantifier");
544 | }
545 |
546 | private void loadSurnameDict() {
547 | DictSegment _SurnameDict = new DictSegment((char) 0);
548 | Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
549 | loadDictFile(_SurnameDict, file, true, "Surname");
550 | }
551 |
552 | private void loadSuffixDict() {
553 | DictSegment _SuffixDict = new DictSegment((char) 0);
554 | Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
555 | loadDictFile(_SuffixDict, file, true, "Suffix");
556 | }
557 |
558 | private void loadPrepDict() {
559 | DictSegment _PrepDict = new DictSegment((char) 0);
560 | Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
561 | loadDictFile(_PrepDict, file, true, "Preposition");
562 | }
563 |
564 | void reLoadMainDict() {
565 | logger.info("start to reload ik dict.");
566 | // 新开一个实例加载词典,减少加载过程对当前词典使用的影响
567 | Dictionary tmpDict = new Dictionary(configuration);
568 | tmpDict.configuration = getSingleton().configuration;
569 | tmpDict.loadMainDict();
570 | tmpDict.loadStopWordDict();
571 | _MainDict = tmpDict._MainDict;
572 | _StopWords = tmpDict._StopWords;
573 | logger.info("reload ik dict finished.");
574 | }
575 |
576 | }
577 |
--------------------------------------------------------------------------------
/config/extra_single_word_low_freq.dic:
--------------------------------------------------------------------------------
1 | 踧
2 | 覢
3 | 觓
4 | 覛
5 | 覅
6 | 覟
7 | 覗
8 | 覣
9 | 覭
10 | 覂
11 | 觡
12 | 覝
13 | 觟
14 | 褱
15 | 褰
16 | 襒
17 | 覞
18 | 袨
19 | 觏
20 | 赒
21 | 觇
22 | 謍
23 | 讙
24 | 襦
25 | 袤
26 | 誸
27 | 诮
28 | 衩
29 | 茷
30 | 趒
31 | 襌
32 | 诰
33 | 譠
34 | 袄
35 | 聱
36 | 豸
37 | 蠓
38 | 讵
39 | 袅
40 | 诂
41 | 裞
42 | 訄
43 | 荺
44 | 褂
45 | 蠡
46 | 裐
47 | 諴
48 | 芫
49 | 赧
50 | 触
51 | 跫
52 | 褫
53 | 赝
54 | 褡
55 | 衪
56 | 裎
57 | 豜
58 | 褶
59 | 裟
60 | 跏
61 | 袪
62 | 袈
63 | 觐
64 | 跄
65 | 坏
66 | 肱
67 | 裾
68 | 考
69 | 豝
70 | 踰
71 | 覃
72 | 蹓
73 | 黾
74 | 褴
75 | 轲
76 | 裨
77 | 蜇
78 | 鮆
79 | 褥
80 | 誊
81 | 貉
82 | 褊
83 | 蜉
84 | 衔
85 | 詄
86 | 豋
87 | 胼
88 | 荞
89 | 踫
90 | 谗
91 | 耦
92 | 誏
93 | 衮
94 | 胝
95 | 幔
96 | 轭
97 | 赈
98 | 贲
99 | 蓼
100 | 褛
101 | 迵
102 | 觊
103 | 蚜
104 | 讫
105 | 颢
106 | 葄
107 | 觎
108 | 诎
109 | 謢
110 | 蹧
111 | 邬
112 | 芊
113 | 赣
114 | 囱
115 | 蝎
116 | 夆
117 | 蠋
118 | 蠕
119 | 蹼
120 | 臊
121 | 蛭
122 | 颚
123 | 讴
124 | 踽
125 | 菫
126 | 臾
127 | 薮
128 | 蹒
129 | 谀
130 | 菀
131 | 佶
132 | 摀
133 | 佚
134 | 邸
135 | 跺
136 | 豊
137 | 荔
138 | 锌
139 | 诿
140 | 蕤
141 | 诳
142 | 芩
143 | 蹴
144 | 褉
145 | 觔
146 | 舴
147 | 腋
148 | 颍
149 | 膊
150 | 脯
151 | 荪
152 | 郢
153 | 坛
154 | 轫
155 | 醺
156 | 捺
157 | 姝
158 | 胭
159 | 饷
160 | 谪
161 | 驮
162 | 僮
163 | 踯
164 | 忪
165 | 驷
166 | 躅
167 | 忑
168 | 彧
169 | 衲
170 | 唠
171 | 跚
172 | 吃
173 | 诩
174 | 褓
175 | 诤
176 | 豨
177 | 诋
178 | 菈
179 | 逖
180 | 荟
181 | 裆
182 | 喋
183 | 忖
184 | 闾
185 | 诌
186 | 啻
187 | 铀
188 | 菡
189 | 胱
190 | 蹬
191 | 隹
192 | 鹬
193 | 诒
194 | 轧
195 | 萏
196 | 舶
197 | 鳅
198 | 药
199 | 酯
200 | 夯
201 | 偬
202 | 酝
203 | 跻
204 | 咤
205 | 掬
206 | 呆
207 | 蹶
208 | 踞
209 | 蝌
210 | 咋
211 | 谧
212 | 舫
213 | 啐
214 | 茸
215 | 谟
216 | 嵌
217 | 蜿
218 | 魇
219 | 帷
220 | 觑
221 | 鳍
222 | 谏
223 | 哽
224 | 乓
225 | 蚌
226 | 嗙
227 | 巿
228 | 刽
229 | 踱
230 | 腆
231 | 薏
232 | 蜃
233 | 谑
234 | 躄
235 | 鸾
236 | 齁
237 | 腼
238 | 呷
239 | 吆
240 | 荀
241 | 裱
242 | 辇
243 | 睫
244 | 伎
245 | 妲
246 | 菠
247 | 鼐
248 | 麾
249 | 芮
250 | 鲑
251 | 辉
252 | 啜
253 | 苞
254 | 踼
255 | 荃
256 | 杞
257 | 浣
258 | 沬
259 | 胤
260 | 恿
261 | 驭
262 | 逵
263 | 钛
264 | 徕
265 | 贮
266 | 蔫
267 | 锚
268 | 衙
269 | 肄
270 | 豺
271 | 闸
272 | 隋
273 | 腑
274 | 脐
275 | 脓
276 | 叱
277 | 迥
278 | 踝
279 | 馥
280 | 佣
281 | 喳
282 | 迩
283 | 贻
284 | 诙
285 | 椭
286 | 琬
287 | 赂
288 | 诧
289 | 苯
290 | 怂
291 | 蟆
292 | 龊
293 | 漳
294 | 迭
295 | 垛
296 | 铲
297 | 馊
298 | 娓
299 | 葆
300 | 赑
301 | 卍
302 | 遽
303 | 谯
304 | 賏
305 | 蛹
306 | 锤
307 | 粟
308 | 衿
309 | 渥
310 | 铳
311 | 刍
312 | 镳
313 | 匮
314 | 万
315 | 骁
316 | 酣
317 | 酉
318 | 骥
319 | 寨
320 | 蓁
321 | 诽
322 | 钡
323 | 浙
324 | 酗
325 | 跩
326 | 拗
327 | 坷
328 | 雱
329 | 闺
330 | 喈
331 | 晔
332 | 螳
333 | 谙
334 | 蹂
335 | 鞑
336 | 蔗
337 | 账
338 | 垚
339 | 瞩
340 | 谩
341 | 掳
342 | 媲
343 | 葾
344 | 鳗
345 | 钣
346 | 檀
347 | 阕
348 | 聿
349 | 蜍
350 | 仆
351 | 嗅
352 | 峥
353 | 蜈
354 | 垠
355 | 蚓
356 | 麓
357 | 殉
358 | 弩
359 | 朴
360 | 胥
361 | 瘴
362 | 篑
363 | 镍
364 | 鹂
365 | 暐
366 | 榷
367 | 咀
368 | 佯
369 | 蚣
370 | 荻
371 | 鬓
372 | 仝
373 | 裴
374 | 讷
375 | 孺
376 | 咨
377 | 俑
378 | 遴
379 | 吽
380 | 笋
381 | 耀
382 | 霾
383 | 绎
384 | 咿
385 | 骸
386 | 霭
387 | 昕
388 | 漩
389 | 浒
390 | 轼
391 | 婿
392 | 嗳
393 | 钙
394 | 谲
395 | 蛾
396 | 跛
397 | 惺
398 | 翎
399 | 炽
400 | 晒
401 | 钳
402 | 鞘
403 | 谚
404 | 钊
405 | 背
406 | 瀛
407 | 槌
408 | 臀
409 | 跋
410 | 窒
411 | 藤
412 | 噬
413 | 蓊
414 | 褐
415 | 蔺
416 | 鲍
417 | 鲨
418 | 舔
419 | 箔
420 | 萦
421 | 诏
422 | 褔
423 | 咄
424 | 俘
425 | 彪
426 | 饪
427 | 嘱
428 | 诬
429 | 踮
430 | 囝
431 | 佢
432 | 汶
433 | 讹
434 | 踅
435 | 咐
436 | 讼
437 | 玟
438 | 迂
439 | 亵
440 | 婵
441 | 馁
442 | 崭
443 | 惦
444 | 蠹
445 | 濒
446 | 匈
447 | 蟋
448 | 谕
449 | 酪
450 | 眛
451 | 煦
452 | 甭
453 | 谄
454 | 妾
455 | 梧
456 | 芜
457 | 蛎
458 | 颐
459 | 雌
460 | 褒
461 | 臼
462 | 圳
463 | 剔
464 | 噶
465 | 耨
466 | 嗈
467 | 勋
468 | 冶
469 | 扑
470 | 膺
471 | 腺
472 | 荤
473 | 坞
474 | 羲
475 | 栾
476 | 傌
477 | 幌
478 | 噗
479 | 蛀
480 | 觞
481 | 塾
482 | 耙
483 | 枭
484 | 擞
485 | 缅
486 | 踌
487 | 蟀
488 | 侥
489 | 诣
490 | 姜
491 | 甸
492 | 俭
493 | 泠
494 | 躇
495 | 萌
496 | 虏
497 | 匕
498 | 藩
499 | 嗽
500 | 蜻
501 | 咛
502 | 艹
503 | 跎
504 | 蔬
505 | 鸠
506 | 跆
507 | 肋
508 | 巅
509 | 芯
510 | 荐
511 | 荼
512 | 慵
513 | 咸
514 | 杭
515 | 樟
516 | 夸
517 | 戮
518 | 吱
519 | 模
520 | 葔
521 | 迢
522 | 砰
523 | 须
524 | 蒜
525 | 骐
526 | 茱
527 | 痊
528 | 蛤
529 | 蜴
530 | 诟
531 | 俾
532 | 疮
533 | 悴
534 | 袒
535 | 蒹
536 | 镖
537 | 娥
538 | 鹉
539 | 婊
540 | 噫
541 | 矜
542 | 岳
543 | 鹦
544 | 葭
545 | 褚
546 | 嵩
547 | 丫
548 | 凛
549 | 峦
550 | 惚
551 | 懊
552 | 韶
553 | 憋
554 | 聋
555 | 讪
556 | 瘫
557 | 霓
558 | 哺
559 | 蝙
560 | 靥
561 | 堇
562 | 铺
563 | 趾
564 | 褪
565 | 缆
566 | 媛
567 | 胧
568 | 肛
569 | 珈
570 | 畴
571 | 驹
572 | 熔
573 | 臆
574 | 肘
575 | 豁
576 | 冕
577 | 吊
578 | 韧
579 | 炜
580 | 舱
581 | 恁
582 | 巳
583 | 舵
584 | 臻
585 | 戊
586 | 稽
587 | 诲
588 | 隽
589 | 铐
590 | 鲫
591 | 畸
592 | 饥
593 | 茉
594 | 蒲
595 | 矶
596 | 峨
597 | 蚵
598 | 蔼
599 | 诛
600 | 焰
601 | 偈
602 | 蚱
603 | 骯
604 | 盔
605 | 巩
606 | 折
607 | 偕
608 | 嗓
609 | 辙
610 | 鸶
611 | 酵
612 | 莘
613 | 耘
614 | 汹
615 | 楞
616 | 陡
617 | 裳
618 | 憎
619 | 讳
620 | 荆
621 | 笃
622 | 屉
623 | 霈
624 | 恬
625 | 蹦
626 | 扬
627 | 侃
628 | 艳
629 | 璇
630 | 韬
631 | 烬
632 | 傀
633 | 铮
634 | 曦
635 | 搂
636 | 蝠
637 | 霄
638 | 胺
639 | 遐
640 | 飨
641 | 郡
642 | 困
643 | 呎
644 | 墅
645 | 鞠
646 | 瘤
647 | 藻
648 | 咆
649 | 踹
650 | 狷
651 | 镀
652 | 桐
653 | 赘
654 | 揽
655 | 炬
656 | 氢
657 | 膛
658 | 搪
659 | 湿
660 | 唆
661 | 兑
662 | 暸
663 | 厮
664 | 懈
665 | 媳
666 | 塘
667 | 靡
668 | 鹭
669 | 祟
670 | 冀
671 | 豚
672 | 蹄
673 | 橙
674 | 阎
675 | 硫
676 | 埠
677 | 噱
678 | 妃
679 | 搓
680 | 啃
681 | 俞
682 | 龚
683 | 橄
684 | 嚎
685 | 椎
686 | 蓦
687 | 朔
688 | 痘
689 | 鳞
690 | 铠
691 | 叽
692 | 跤
693 | 裔
694 | 诃
695 | 岫
696 | 怯
697 | 讥
698 | 聂
699 | 垢
700 | 藐
701 | 濑
702 | 莒
703 | 淇
704 | 毯
705 | 礁
706 | 赃
707 | 庐
708 | 辕
709 | 瞌
710 | 锯
711 | 莓
712 | 涡
713 | 昼
714 | 捌
715 | 嗡
716 | 倌
717 | 禹
718 | 蹋
719 | 卯
720 | 粪
721 | 耽
722 | 闰
723 | 曳
724 | 苔
725 | 诵
726 | 菇
727 | 斟
728 | 芥
729 | 莅
730 | 喀
731 | 麒
732 | 颊
733 | 扛
734 | 曜
735 | 咎
736 | 缮
737 | 诫
738 | 躁
739 | 茜
740 | 缤
741 | 暧
742 | 郄
743 | 酥
744 | 僻
745 | 躬
746 | 峙
747 | 驯
748 | 噎
749 | 厦
750 | 澜
751 | 杏
752 | 樽
753 | 勘
754 | 煤
755 | 茎
756 | 嚷
757 | 昆
758 | 铸
759 | 烘
760 | 邹
761 | 廓
762 | 拚
763 | 俐
764 | 裘
765 | 饵
766 | 恃
767 | 蔓
768 | 笙
769 | 茁
770 | 楷
771 | 嚼
772 | 锻
773 | 蕊
774 | 脖
775 | 茍
776 | 壤
777 | 琮
778 | 莽
779 | 塌
780 | 蚤
781 | 膳
782 | 磋
783 | 蓓
784 | 澈
785 | 萎
786 | 擒
787 | 禄
788 | 儡
789 | 懦
790 | 瞻
791 | 虔
792 | 粥
793 | 赦
794 | 畜
795 | 彷
796 | 寥
797 | 揣
798 | 嫖
799 | 朽
800 | 挂
801 | 啄
802 | 浇
803 | 崖
804 | 棠
805 | 禽
806 | 台
807 | 邂
808 | 矫
809 | 茅
810 | 惫
811 | 吠
812 | 苟
813 | 叩
814 | 徊
815 | 巍
816 | 舆
817 | 邵
818 | 彗
819 | 萃
820 | 拱
821 | 嘶
822 | 貂
823 | 趴
824 | 愿
825 | 脊
826 | 冗
827 | 杆
828 | 蕙
829 | 铎
830 | 囚
831 | 啼
832 | 谤
833 | 徘
834 | 芹
835 | 骆
836 | 夭
837 | 饺
838 | 馒
839 | 溺
840 | 咫
841 | 屐
842 | 绅
843 | 诅
844 | 缉
845 | 渣
846 | 敞
847 | 萱
848 | 丰
849 | 俏
850 | 螃
851 | 蜀
852 | 徽
853 | 逞
854 | 跪
855 | 虞
856 | 隙
857 | 匀
858 | 憧
859 | 辄
860 | 鸳
861 | 疵
862 | 跷
863 | 呱
864 | 穆
865 | 阑
866 | 搏
867 | 肾
868 | 靶
869 | 阱
870 | 囡
871 | 寰
872 | 庄
873 | 蟾
874 | 怠
875 | 腕
876 | 烟
877 | 巾
878 | 奢
879 | 垄
880 | 姨
881 | 躯
882 | 肺
883 | 钰
884 | 佰
885 | 阙
886 | 雏
887 | 溉
888 | 焚
889 | 丑
890 | 锥
891 | 诘
892 | 瞪
893 | 茹
894 | 绊
895 | 蚀
896 | 袱
897 | 煽
898 | 窕
899 | 掷
900 | 沮
901 | 钞
902 | 涕
903 | 浏
904 | 仄
905 | 孰
906 | 峻
907 | 皱
908 | 芦
909 | 膏
910 | 晰
911 | 衬
912 | 谍
913 | 丞
914 | 绽
915 | 蔽
916 | 呕
917 | 轿
918 | 隶
919 | 楠
920 | 匣
921 | 葵
922 | 沫
923 | 刃
924 | 禧
925 | 晦
926 | 哔
927 | 晖
928 | 绣
929 | 仟
930 | 窟
931 | 谛
932 | 瀚
933 | 黛
934 | 忿
935 | 姚
936 | 蜘
937 | 耸
938 | 捍
939 | 斐
940 | 卜
941 | 辗
942 | 刁
943 | 涅
944 | 泓
945 | 梵
946 | 扳
947 | 暇
948 | 袜
949 | 柠
950 | 傍
951 | 逮
952 | 呃
953 | 蜗
954 | 窍
955 | 琉
956 | 喃
957 | 溢
958 | 抉
959 | 旷
960 | 卅
961 | 亟
962 | 膝
963 | 伶
964 | 闇
965 | 莺
966 | 蔚
967 | 醋
968 | 瑛
969 | 拭
970 | 绮
971 | 鑫
972 | 圭
973 | 脂
974 | 酿
975 | 诈
976 | 膨
977 | 隧
978 | 惭
979 | 庚
980 | 衅
981 | 哨
982 | 凋
983 | 里
984 | 祯
985 | 撼
986 | 谭
987 | 稻
988 | 迋
989 | 碌
990 | 罕
991 | 逾
992 | 嗜
993 | 蹲
994 | 檬
995 | 肖
996 | 辖
997 | 襟
998 | 扎
999 | 槟
1000 | 缔
1001 | 袂
1002 | 敷
1003 | 腥
1004 | 喘
1005 | 簿
1006 | 鳖
1007 | 出
1008 | 噢
1009 | 炫
1010 | 佑
1011 | 贷
1012 | 粮
1013 | 荳
1014 | 桦
1015 | 颉
1016 | 哑
1017 | 倪
1018 | 颤
1019 | 御
1020 | 芽
1021 | 朦
1022 | 裹
1023 | 贬
1024 | 蕉
1025 | 蝉
1026 | 赎
1027 | 崔
1028 | 滔
1029 | 茵
1030 | 径
1031 | 克
1032 | 啤
1033 | 拯
1034 | 坟
1035 | 葱
1036 | 芋
1037 | 瞒
1038 | 掠
1039 | 绳
1040 | 蛛
1041 | 匠
1042 | 凸
1043 | 苛
1044 | 押
1045 | 楣
1046 | 芙
1047 | 酌
1048 | 俺
1049 | 掏
1050 | 倡
1051 | 唾
1052 | 瞄
1053 | 磊
1054 | 吼
1055 | 搅
1056 | 溃
1057 | 聆
1058 | 沌
1059 | 蝇
1060 | 鸥
1061 | 妒
1062 | 焕
1063 | 拙
1064 | 夷
1065 | 迄
1066 | 绰
1067 | 锵
1068 | 耿
1069 | 祺
1070 | 吶
1071 | 惶
1072 | 廊
1073 | 兜
1074 | 倩
1075 | 杖
1076 | 窄
1077 | 僚
1078 | 竖
1079 | 芷
1080 | 咚
1081 | 鲢
1082 | 沛
1083 | 挪
1084 | 柄
1085 | 顷
1086 | 璞
1087 | 裸
1088 | 鵰
1089 | 郊
1090 | 屿
1091 | 仕
1092 | 艘
1093 | 铅
1094 | 铝
1095 | 饲
1096 | 黯
1097 | 疫
1098 | 栽
1099 | 喉
1100 | 逗
1101 | 祇
1102 | 阪
1103 | 侍
1104 | 抒
1105 | 弗
1106 | 尬
1107 | 浦
1108 | 鄙
1109 | 盏
1110 | 喽
1111 | 炳
1112 | 卵
1113 | 肌
1114 | 迦
1115 | 擅
1116 | 豹
1117 | 胏
1118 | 炼
1119 | 悸
1120 | 谴
1121 | 贾
1122 | 胀
1123 | 疋
1124 | 矿
1125 | 梨
1126 | 碑
1127 | 髓
1128 | 巢
1129 | 叹
1130 | 屡
1131 | 滩
1132 | 侮
1133 | 橘
1134 | 嘲
1135 | 酬
1136 | 枚
1137 | 氓
1138 | 菌
1139 | 颁
1140 | 萝
1141 | 谘
1142 | 曝
1143 | 薯
1144 | 襄
1145 | 辽
1146 | 萄
1147 | 寇
1148 | 舜
1149 | 颂
1150 | 撰
1151 | 腻
1152 | 崩
1153 | 咕
1154 | 癌
1155 | 歇
1156 | 汰
1157 | 烁
1158 | 撇
1159 | 宴
1160 | 惩
1161 | 烛
1162 | 贰
1163 | 呻
1164 | 呒
1165 | 翩
1166 | 绑
1167 | 捞
1168 | 爹
1169 | 秉
1170 | 棉
1171 | 妓
1172 | 尉
1173 | 霍
1174 | 甫
1175 | 尝
1176 | 葡
1177 | 蒸
1178 | 鸦
1179 | 挚
1180 | 奸
1181 | 纬
1182 | 艰
1183 | 履
1184 | 葬
1185 | 滨
1186 | 耕
1187 | 婴
1188 | 醇
1189 | 堵
1190 | 钉
1191 | 喧
1192 | 遂
1193 | 锣
1194 | 垮
1195 | 蓬
1196 | 薛
1197 | 虐
1198 | 睁
1199 | 厨
1200 | 娶
1201 | 浆
1202 | 挨
1203 | 矢
1204 | 蕾
1205 | 伺
1206 | 券
1207 | 鹏
1208 | 削
1209 | 蓄
1210 | 琦
1211 | 熄
1212 | 湘
1213 | 慌
1214 | 枕
1215 | 衍
1216 | 薇
1217 | 囊
1218 | 喂
1219 | 蕴
1220 | 倘
1221 | 峡
1222 | 浊
1223 | 窃
1224 | 颈
1225 | 裙
1226 | 晕
1227 | 缚
1228 | 获
1229 | 帕
1230 | 脾
1231 | 莹
1232 | 逍
1233 | 姬
1234 | 韦
1235 | 畔
1236 | 伐
1237 | 霞
1238 | 嘘
1239 | 盐
1240 | 摧
1241 | 债
1242 | 佩
1243 | 畏
1244 | 驴
1245 | 氧
1246 | 奴
1247 | 瘦
1248 | 菊
1249 | 廿
1250 | 狭
1251 | 赴
1252 | 碳
1253 | 坊
1254 | 盆
1255 | 趟
1256 | 匿
1257 | 肇
1258 | 溶
1259 | 揭
1260 | 剥
1261 | 沦
1262 | 秃
1263 | 郝
1264 | 唔
1265 | 锡
1266 | 娇
1267 | 抚
1268 | 屎
1269 | 甩
1270 | 娱
1271 | 表
1272 | 犬
1273 | 魁
1274 | 蒂
1275 | 皓
1276 | 祷
1277 | 瞎
1278 | 瘾
1279 | 煎
1280 | 螺
1281 | 遮
1282 | 坠
1283 | 剎
1284 | 筝
1285 | 棵
1286 | 冤
1287 | 崎
1288 | 昔
1289 | 驼
1290 | 竿
1291 | 甄
1292 | 斑
1293 | 歹
1294 | 骏
1295 | 缝
1296 | 鞭
1297 | 垫
1298 | 淹
1299 | 并
1300 | 遨
1301 | 宠
1302 | 掰
1303 | 枯
1304 | 艇
1305 | 豫
1306 | 募
1307 | 郁
1308 | 稚
1309 | 懿
1310 | 辐
1311 | 酱
1312 | 恕
1313 | 范
1314 | 涂
1315 | 滤
1316 | 肃
1317 | 膜
1318 | 佬
1319 | 哼
1320 | 慨
1321 | 穗
1322 | 辰
1323 | 雁
1324 | 瑟
1325 | 帆
1326 | 拢
1327 | 汁
1328 | 蝴
1329 | 冈
1330 | 诠
1331 | 蹈
1332 | 黏
1333 | 痞
1334 | 屑
1335 | 潇
1336 | 觅
1337 | 钧
1338 | 挣
1339 | 谐
1340 | 霜
1341 | 诊
1342 | 熬
1343 | 讽
1344 | 歧
1345 | 戈
1346 | 闯
1347 | 饶
1348 | 斤
1349 | 婉
1350 | 致
1351 | 贿
1352 | 苑
1353 | 矮
1354 | 毋
1355 | 詹
1356 | 祈
1357 | 咳
1358 | 昱
1359 | 佐
1360 | 帖
1361 | 猩
1362 | 尹
1363 | 诇
1364 | 肆
1365 | 亭
1366 | 丘
1367 | 淘
1368 | 颠
1369 | 勃
1370 | 讶
1371 | 抖
1372 | 袁
1373 | 柱
1374 | 僧
1375 | 蚊
1376 | 匹
1377 | 辣
1378 | 螂
1379 | 澡
1380 | 昧
1381 | 诡
1382 | 槽
1383 | 穴
1384 | 斩
1385 | 聘
1386 | 扶
1387 | 熙
1388 | 驰
1389 | 棍
1390 | 兆
1391 | 蟑
1392 | 矩
1393 | 谬
1394 | 贫
1395 | 鼎
1396 | 践
1397 | 盲
1398 | 眷
1399 | 尿
1400 | 伫
1401 | 饿
1402 | 砸
1403 | 妄
1404 | 荡
1405 | 炒
1406 | 冥
1407 | 偿
1408 | 墓
1409 | 骄
1410 | 毙
1411 | 淋
1412 | 芝
1413 | 胃
1414 | 宅
1415 | 董
1416 | 梭
1417 | 凑
1418 | 宰
1419 | 卑
1420 | 丛
1421 | 纠
1422 | 肢
1423 | 闽
1424 | 铜
1425 | 寺
1426 | 瞬
1427 | 澳
1428 | 庞
1429 | 腔
1430 | 泼
1431 | 昂
1432 | 梁
1433 | 躺
1434 | 姻
1435 | 潭
1436 | 吋
1437 | 撤
1438 | 殖
1439 | 轴
1440 | 颖
1441 | 冻
1442 | 琼
1443 | 恳
1444 | 衫
1445 | 譬
1446 | 猎
1447 | 衰
1448 | 桶
1449 | 辜
1450 | 筒
1451 | 赫
1452 | 仗
1453 | 膀
1454 | 乳
1455 | 嚣
1456 | 划
1457 | 玮
1458 | 卿
1459 | 枉
1460 | 埃
1461 | 跨
1462 | 粹
1463 | 猴
1464 | 愤
1465 | 壹
1466 | 卢
1467 | 尧
1468 | 翰
1469 | 叮
1470 | 媚
1471 | 钮
1472 | 袖
1473 | 斌
1474 | 卓
1475 | 粽
1476 | 雀
1477 | 谦
1478 | 傅
1479 | 殿
1480 | 睹
1481 | 菁
1482 | 桂
1483 | 诱
1484 | 舌
1485 | 惟
1486 | 岗
1487 | 衷
1488 | 屈
1489 | 陋
1490 | 陌
1491 | 宵
1492 | 麟
1493 | 魏
1494 | 贸
1495 | 几
1496 | 埔
1497 | 谎
1498 | 袍
1499 | 卸
1500 | 仓
1501 | 匪
1502 | 叛
1503 | 肠
1504 | 肝
1505 | 俄
1506 | 孕
1507 | 庙
1508 | 嫁
1509 | 肤
1510 | 拦
1511 | 羯
1512 | 匙
1513 | 咏
1514 | 蠢
1515 | 纽
1516 | 拘
1517 | 旨
1518 | 胁
1519 | 馨
1520 | 珊
1521 | 签
1522 | 赔
1523 | 秩
1524 | 喻
1525 | 谜
1526 | 翠
1527 | 芭
1528 | 摊
1529 | 侣
1530 | 灿
1531 | 寡
1532 | 罐
1533 | 贼
1534 | 叙
1535 | 谨
1536 | 体
1537 | 敲
1538 | 浴
1539 | 吻
1540 | 臂
1541 | 袭
1542 | 煮
1543 | 腹
1544 | 暮
1545 | 曹
1546 | 虹
1547 | 抑
1548 | 贩
1549 | 踩
1550 | 澎
1551 | 糖
1552 | 催
1553 | 萍
1554 | 垂
1555 | 斥
1556 | 侬
1557 | 拷
1558 | 唤
1559 | 匆
1560 | 阮
1561 | 飙
1562 | 柴
1563 | 剂
1564 | 妖
1565 | 添
1566 | 畅
1567 | 汗
1568 | 鸭
1569 | 稀
1570 | 晋
1571 | 埋
1572 | 弊
1573 | 返
1574 | 叡
1575 | 娟
1576 | 玻
1577 | 腾
1578 | 栋
1579 | 歪
1580 | 邓
1581 | 渴
1582 | 粒
1583 | 泣
1584 | 疾
1585 | 蓉
1586 | 塑
1587 | 祂
1588 | 储
1589 | 劣
1590 | 柯
1591 | 陶
1592 | 患
1593 | 蛇
1594 | 腐
1595 | 琳
1596 | 慎
1597 | 泊
1598 | 牢
1599 | 呈
1600 | 趁
1601 | 恶
1602 | 浑
1603 | 扮
1604 | 樱
1605 | 臣
1606 | 遵
1607 | 缠
1608 | 虫
1609 | 撒
1610 | 叉
1611 | 刑
1612 | 苗
1613 | 脉
1614 | 盈
1615 | 津
1616 | 愧
1617 | 摔
1618 | 盒
1619 | 丧
1620 | 鹤
1621 | 呦
1622 | 厕
1623 | 斜
1624 | 芒
1625 | 翅
1626 | 悄
1627 | 晃
1628 | 茂
1629 | 寸
1630 | 杉
1631 | 旺
1632 | 俩
1633 | 雯
1634 | 霖
1635 | 递
1636 | 胶
1637 | 氛
1638 | 谣
1639 | 捉
1640 | 虾
1641 | 秘
1642 | 漠
1643 | 扭
1644 | 贞
1645 | 陵
1646 | 叔
1647 | 轨
1648 | 鹅
1649 | 液
1650 | 妥
1651 | 贱
1652 | 涨
1653 | 滥
1654 | 痕
1655 | 沿
1656 | 秤
1657 | 措
1658 | 巡
1659 | 丈
1660 | 魅
1661 | 欲
1662 | 缸
1663 | 鹿
1664 | 汝
1665 | 迁
1666 | 矣
1667 | 肩
1668 | 烤
1669 | 笛
1670 | 迅
1671 | 劫
1672 | 趋
1673 | 披
1674 | 荷
1675 | 卒
1676 | 丙
1677 | 碗
1678 | 伙
1679 | 椅
1680 | 赞
1681 | 侦
1682 | 灾
1683 | 秦
1684 | 蛙
1685 | 禅
1686 | 慰
1687 | 余
1688 | 朗
1689 | 辱
1690 | 征
1691 | 愚
1692 | 抛
1693 | 挺
1694 | 彭
1695 | 允
1696 | 靖
1697 | 滋
1698 | 凝
1699 | 赠
1700 | 莎
1701 | 顽
1702 | 狠
1703 | 堕
1704 | 翘
1705 | 惹
1706 | 纲
1707 | 贯
1708 | 饼
1709 | 抬
1710 | 逆
1711 | 堪
1712 | 坤
1713 | 斗
1714 | 钦
1715 | 疏
1716 | 羞
1717 | 扇
1718 | 蜂
1719 | 赌
1720 | 驻
1721 | 屏
1722 | 爵
1723 | 轰
1724 | 契
1725 | 悦
1726 | 邻
1727 | 哉
1728 | 陀
1729 | 裂
1730 | 刷
1731 | 毅
1732 | 拾
1733 | 疼
1734 | 阔
1735 | 耍
1736 | 亏
1737 | 吟
1738 | 锐
1739 | 惧
1740 | 锅
1741 | 蝶
1742 | 壳
1743 | 糕
1744 | 舟
1745 | 牧
1746 | 妮
1747 | 粗
1748 | 仇
1749 | 驶
1750 | 促
1751 | 孝
1752 | 裤
1753 | 誉
1754 | 家
1755 | 迈
1756 | 姿
1757 | 踪
1758 | 兔
1759 | 综
1760 | 旭
1761 | 韵
1762 | 齿
1763 | 乔
1764 | 怖
1765 | 晴
1766 | 闷
1767 | 墨
1768 | 咬
1769 | 侧
1770 | 狱
1771 | 琪
1772 | 梯
1773 | 宾
1774 | 枫
1775 | 锦
1776 | 瑜
1777 | 敦
1778 | 矛
1779 | 弘
1780 | 玛
1781 | 茫
1782 | 迪
1783 | 览
1784 | 挤
1785 | 雳
1786 | 岚
1787 | 卷
1788 | 黎
1789 | 薄
1790 | 柳
1791 | 咦
1792 | 廷
1793 | 瞧
1794 | 幅
1795 | 挖
1796 | 唬
1797 | 侯
1798 | 祸
1799 | 饰
1800 | 儒
1801 | 捡
1802 | 筋
1803 | 融
1804 | 耗
1805 | 铃
1806 | 奉
1807 | 鼻
1808 | 坜
1809 | 曼
1810 | 贡
1811 | 嗨
1812 | 炎
1813 | 啡
1814 | 捐
1815 | 炮
1816 | 霹
1817 | 貌
1818 | 鸣
1819 | 饱
1820 | 廉
1821 | 绘
1822 | 咪
1823 | 吝
1824 | 肚
1825 | 云
1826 | 翼
1827 | 氏
1828 | 骚
1829 | 爷
1830 | 寿
1831 | 绕
1832 | 唷
1833 | 牺
1834 | 屠
1835 | 谋
1836 | 彻
1837 | 俱
1838 | 粉
1839 | 雾
1840 | 涵
1841 | 侨
1842 | 础
1843 | 疗
1844 | 署
1845 | 稿
1846 | 涉
1847 | 稣
1848 | 誓
1849 | 箭
1850 | 涯
1851 | 锺
1852 | 迹
1853 | 抄
1854 | 踢
1855 | 贪
1856 | 咖
1857 | 莱
1858 | 夺
1859 | 勉
1860 | 焦
1861 | 蒋
1862 | 桑
1863 | 沧
1864 | 恰
1865 | 泳
1866 | 牲
1867 | 戒
1868 | 恼
1869 | 夕
1870 | 棚
1871 | 爬
1872 | 菲
1873 | 翁
1874 | 奔
1875 | 滴
1876 | 玄
1877 | 捷
1878 | 曰
1879 | 愉
1880 | 逊
1881 | 憾
1882 | 钓
1883 | 壁
1884 | 躲
1885 | 嫌
1886 | 姆
1887 | 乏
1888 | 洛
1889 | 逼
1890 | 磨
1891 | 剪
1892 | 逝
1893 | 亨
1894 | 盼
1895 | 杯
1896 | 敝
1897 | 碍
1898 | 痴
1899 | 植
1900 | 瑰
1901 | 勤
1902 | 悟
1903 | 彬
1904 | 删
1905 | 薪
1906 | 悠
1907 | 胎
1908 | 侵
1909 | 坪
1910 | 赋
1911 | 弯
1912 | 丹
1913 | 巫
1914 | 轩
1915 | 辨
1916 | 吐
1917 | 么
1918 | 盾
1919 | 扯
1920 | 割
1921 | 艾
1922 | 幼
1923 | 捕
1924 | 召
1925 | 怒
1926 | 坡
1927 | 缓
1928 | 猛
1929 | 驾
1930 | 莉
1931 | 彦
1932 | 韩
1933 | 鞋
1934 | 碧
1935 | 泽
1936 | 泉
1937 | 缴
1938 | 跃
1939 | 喇
1940 | 腿
1941 | 糟
1942 | 胆
1943 | 摘
1944 | 朵
1945 | 逛
1946 | 甜
1947 | 拔
1948 | 劲
1949 | 悉
1950 | 穷
1951 | 汤
1952 | 唐
1953 | 臭
1954 | 玲
1955 | 怡
1956 | 舍
1957 | 欺
1958 | 蜜
1959 | 耻
1960 | 坦
1961 | 叭
1962 | 亿
1963 | 忌
1964 | 鲁
1965 | 繁
1966 | 泥
1967 | 伸
1968 | 壮
1969 | 串
1970 | 圾
1971 | 币
1972 | 荒
1973 | 垃
1974 | 妇
1975 | 旦
1976 | 截
1977 | 喷
1978 | 碎
1979 | 吕
1980 | 犹
1981 | 抹
1982 | 脆
1983 | 煞
1984 | 胞
1985 | 晶
1986 | 潜
1987 | 玫
1988 | 妻
1989 | 估
1990 | 陷
1991 | 孔
1992 | 娃
1993 | 兽
1994 | 肥
1995 | 凉
1996 | 岂
1997 | 逻
1998 | 胸
1999 | 杜
2000 | 袋
2001 | 甘
2002 | 邀
2003 | 培
2004 | 龄
2005 | 辆
2006 | 廖
2007 | 冲
2008 | 渡
2009 | 羽
2010 | 秒
2011 | 辞
2012 | 倾
2013 | 窝
2014 | 柏
2015 | 淑
2016 | 诞
2017 | 漏
2018 | 姑
2019 | 托
2020 | 吾
2021 | 纷
2022 | 拆
2023 | 浩
2024 | 税
2025 | 邱
2026 | 迟
2027 | 筹
2028 | 监
2029 | 汪
2030 | 擎
2031 | 衡
2032 | 狐
2033 | 灰
2034 | 尖
2035 | 番
2036 | 罚
2037 | 证
2038 | 盗
2039 | 祥
2040 | 毫
2041 | 彰
2042 | 扩
2043 | 幽
2044 | 阐
2045 | 喊
2046 | 菩
2047 | 赐
2048 | 奋
2049 | 鲜
2050 | 劝
2051 | 栏
2052 | 慈
2053 | 扫
2054 | 尽
2055 | 穹
2056 | 丌
2057 | 绪
2058 | 砂
2059 | 勿
2060 | 抢
2061 | 啪
2062 | 庸
2063 | 赤
2064 | 饮
2065 | 萨
2066 | 兼
2067 | 访
2068 | 舒
2069 | 裕
2070 | 逸
2071 | 宙
2072 | 丸
2073 | 准
2074 | 魂
2075 | 厚
2076 | 励
2077 | 仰
2078 | 糊
2079 | 顿
2080 | 闭
2081 | 塔
2082 | 枪
2083 | 睛
2084 | 斋
2085 | 奥
2086 | 恭
2087 | 翔
2088 | 遥
2089 | 航
2090 | 孟
2091 | 昌
2092 | 卧
2093 | 颇
2094 | 革
2095 | 邪
2096 | 阻
2097 | 蟹
2098 | 裁
2099 | 后
2100 | 函
2101 | 于
2102 | 拳
2103 | 宽
2104 | 锋
2105 | 州
2106 | 葛
2107 | 拒
2108 | 池
2109 | 镇
2110 | 芬
2111 | 岸
2112 | 寞
2113 | 凭
2114 | 姊
2115 | 殊
2116 | 板
2117 | 勒
2118 | 慕
2119 | 跌
2120 | 踏
2121 | 填
2122 | 陪
2123 | 逐
2124 | 洽
2125 | 描
2126 | 妨
2127 | 仪
2128 | 摄
2129 | 紫
2130 | 谅
2131 | 阅
2132 | 邦
2133 | 麦
2134 | 莲
2135 | 闪
2136 | 纵
2137 | 庭
2138 | 圈
2139 | 榜
2140 | 滑
2141 | 舰
2142 | 面
2143 | 献
2144 | 浅
2145 | 飘
2146 | 宋
2147 | 俗
2148 | 沟
2149 | 巷
2150 | 眠
2151 | 帽
2152 | 惑
2153 | 羊
2154 | 牵
2155 | 净
2156 | 厉
2157 | 撞
2158 | 崇
2159 | 竞
2160 | 回
2161 | 乙
2162 | 聪
2163 | 桃
2164 | 伍
2165 | 役
2166 | 潮
2167 | 损
2168 | 凯
2169 | 锁
2170 | 震
2171 | 醉
2172 | 屁
2173 | 牠
2174 | 孙
2175 | 酷
2176 | 染
2177 | 尺
2178 | 摸
2179 | 盛
2180 | 闹
2181 | 棋
2182 | 吓
2183 | 迫
2184 | 瓜
2185 | 松
2186 | 搬
2187 | 戴
2188 | 瞭
2189 | 乌
2190 | 谱
2191 | 滚
2192 | 赚
2193 | 障
2194 | 逃
2195 | 齐
2196 | 牙
2197 | 怨
2198 | 拖
2199 | 皇
2200 | 贺
2201 | 横
2202 | 塞
2203 | 摆
2204 | 农
2205 | 倍
2206 | 额
2207 | 乘
2208 | 户
2209 | 奈
2210 | 川
2211 | 徐
2212 | 井
2213 | 寝
2214 | 洞
2215 | 劳
2216 | 船
2217 | 域
2218 | 屋
2219 | 胖
2220 | 藉
2221 | 销
2222 | 拼
2223 | 桌
2224 | 忧
2225 | 违
2226 | 拟
2227 | 吵
2228 | 媒
2229 | 辩
2230 | 妙
2231 | 鸿
2232 | 恩
2233 | 映
2234 | 耳
2235 | 傻
2236 | 京
2237 | 搭
2238 | 残
2239 | 稍
2240 | 颜
2241 | 固
2242 | 眉
2243 | 龟
2244 | 哀
2245 | 发
2246 | 沈
2247 | 拨
2248 | 丁
2249 | 愁
2250 | 耐
2251 | 宪
2252 | 覆
2253 | 盟
2254 | 昭
2255 | 握
2256 | 萧
2257 | 延
2258 | 豆
2259 | 弱
2260 | 隆
2261 | 页
2262 | 烧
2263 | 遍
2264 | 距
2265 | 摩
2266 | 祖
2267 | 探
2268 | 倚
2269 | 寂
2270 | 阴
2271 | 悔
2272 | 库
2273 | 嘴
2274 | 沉
2275 | 伊
2276 | 暂
2277 | 霸
2278 | 喵
2279 | 频
2280 | 鼓
2281 | 冒
2282 | 鼠
2283 | 企
2284 | 副
2285 | 菜
2286 | 款
2287 | 忽
2288 | 尾
2289 | 租
2290 | 椰
2291 | 隔
2292 | 狼
2293 | 浮
2294 | 惠
2295 | 峰
2296 | 索
2297 | 芳
2298 | 摇
2299 | 洪
2300 | 伦
2301 | 骨
2302 | 吹
2303 | 郑
2304 | 哩
2305 | 珍
2306 | 纳
2307 | 零
2308 | 哲
2309 | 遭
2310 | 瓶
2311 | 亡
2312 | 振
2313 | 予
2314 | 村
2315 | 旅
2316 | 惨
2317 | 汽
2318 | 爸
2319 | 隐
2320 | 械
2321 | 寒
2322 | 危
2323 | 邮
2324 | 贝
2325 | 阶
2326 | 赖
2327 | 茶
2328 | 谊
2329 | 涛
2330 | 惯
2331 | 尘
2332 | 丝
2333 | 森
2334 | 询
2335 | 露
2336 | 稳
2337 | 桥
2338 | 夏
2339 | 哭
2340 | 坚
2341 | 籍
2342 | 厌
2343 | 苍
2344 | 析
2345 | 冰
2346 | 仙
2347 | 布
2348 | 箱
2349 | 脱
2350 | 贤
2351 | 途
2352 | 订
2353 | 财
2354 | 欧
2355 | 赢
2356 | 枢
2357 | 泪
2358 | 废
2359 | 钢
2360 | 渐
2361 | 泡
2362 | 刊
2363 | 肯
2364 | 恨
2365 | 砍
2366 | 抽
2367 | 股
2368 | 咧
2369 | 婆
2370 | 禁
2371 | 郎
2372 | 默
2373 | 符
2374 | 缩
2375 | 童
2376 | 绿
2377 | 骗
2378 | 辈
2379 | 尼
2380 | 届
2381 | 彼
2382 | 兮
2383 | 聚
2384 | 宇
2385 | 辛
2386 | 疯
2387 | 减
2388 | 米
2389 | 念
2390 | 降
2391 | 街
2392 | 临
2393 | 敏
2394 | 洗
2395 | 玉
2396 | 伴
2397 | 辅
2398 | 诺
2399 | 鸡
2400 | 侠
2401 | 健
2402 | 熊
2403 | 顶
2404 | 挑
2405 | 替
2406 | 豪
2407 | 掌
2408 | 饭
2409 | 银
2410 | 圆
2411 | 志
2412 | 休
2413 | 材
2414 | 灭
2415 | 烈
2416 | 爆
2417 | 透
2418 | 遗
2419 | 虚
2420 | 醒
2421 | 货
2422 | 雅
2423 | 宏
2424 | 帅
2425 | 宫
2426 | 港
2427 | 偶
2428 | 丢
2429 | 篮
2430 | 凡
2431 | 瑞
2432 | 硕
2433 | 雪
2434 | 忠
2435 | 蔡
2436 | 插
2437 | 积
2438 | 乖
2439 | 挥
2440 | 抗
2441 | 察
2442 | 末
2443 | 盖
2444 | 厅
2445 | 移
2446 | 吸
2447 | 括
2448 | 笨
2449 | 孤
2450 | 译
2451 | 避
2452 | 秀
2453 | 富
2454 | 漂
2455 | 柔
2456 | 私
2457 | 围
2458 | 狮
2459 | 祝
2460 | 庆
2461 | 序
2462 | 拥
2463 | 洲
2464 | 徒
2465 | 借
2466 | 晓
2467 | 嘉
2468 | 诗
2469 | 淡
2470 | 束
2471 | 姓
2472 | 颗
2473 | 勇
2474 | 犯
2475 | 喝
2476 | 食
2477 | 镜
2478 | 偏
2479 | 猜
2480 | 层
2481 | 帐
2482 | 仅
2483 | 购
2484 | 衣
2485 | 申
2486 | 伯
2487 | 紧
2488 | 县
2489 | 婚
2490 | 季
2491 | 敬
2492 | 弃
2493 | 尊
2494 | 蛋
2495 | 鹰
2496 | 熟
2497 | 冠
2498 | 唯
2499 | 混
2500 | 藏
2501 | 河
2502 | 忍
2503 | 窗
2504 | 朝
2505 | 轮
2506 | 册
2507 | 乡
2508 | 敌
2509 | 散
2510 | 沙
2511 | 幻
2512 | 短
2513 | 略
2514 | 批
2515 | 游
2516 | 奖
2517 | 岛
2518 | 逢
2519 | 脸
2520 | 顾
2521 | 督
2522 | 协
2523 | 雷
2524 | 详
2525 | 穿
2526 | 慧
2527 | 巧
2528 | 罢
2529 | 呼
2530 | 暗
2531 | 贴
2532 | 纸
2533 | 歉
2534 | 郭
2535 | 努
2536 | 担
2537 | 蓝
2538 | 训
2539 | 享
2540 | 架
2541 | 济
2542 | 猪
2543 | 派
2544 | 均
2545 | 妈
2546 | 哦
2547 | 宣
2548 | 检
2549 | 鬼
2550 | 灯
2551 | 策
2552 | 梅
2553 | 启
2554 | 嘿
2555 | 洋
2556 | 伟
2557 | 萤
2558 | 磁
2559 | 啰
2560 | 付
2561 | 弄
2562 | 寄
2563 | 钟
2564 | 播
2565 | 险
2566 | 载
2567 | 赏
2568 | 汉
2569 | 块
2570 | 刀
2571 | 铭
2572 | 施
2573 | 卫
2574 | 弹
2575 | 售
2576 | 叶
2577 | 皆
2578 | 罪
2579 | 虎
2580 | 归
2581 | 毛
2582 | 昨
2583 | 荣
2584 | 律
2585 | 树
2586 | 奏
2587 | 注
2588 | 扁
2589 | 笔
2590 | 旁
2591 | 键
2592 | 制
2593 | 莫
2594 | 堆
2595 | 射
2596 | 承
2597 | 波
2598 | 皮
2599 | 释
2600 | 判
2601 | 含
2602 | 既
2603 | 退
2604 | 纪
2605 | 刻
2606 | 肉
2607 | 靠
2608 | 麻
2609 | 湖
2610 | 继
2611 | 诚
2612 | 姐
2613 | 益
2614 | 置
2615 | 惜
2616 | 艺
2617 | 尚
2618 | 纯
2619 | 骂
2620 | 琴
2621 | 漫
2622 | 援
2623 | 缺
2624 | 诸
2625 | 尤
2626 | 忆
2627 | 景
2628 | 府
2629 | 委
2630 | 刘
2631 | 绍
2632 | 虑
2633 | 暴
2634 | 草
2635 | 充
2636 | 授
2637 | 防
2638 | 素
2639 | 房
2640 | 搞
2641 | 典
2642 | 仔
2643 | 父
2644 | 吉
2645 | 招
2646 | 剑
2647 | 脚
2648 | 突
2649 | 牌
2650 | 餐
2651 | 仁
2652 | 酒
2653 | 礼
2654 | 巴
2655 | 丽
2656 | 亮
2657 | 恐
2658 | 述
2659 | 周
2660 | 杂
2661 | 旧
2662 | 套
2663 | 赵
2664 | 堂
2665 | 创
2666 | 母
2667 | 辑
2668 | 络
2669 | 俊
2670 | 毒
2671 | 威
2672 | 冷
2673 | 蛮
2674 | 普
2675 | 登
2676 | 微
2677 | 控
2678 | 爽
2679 | 香
2680 | 坐
2681 | 缘
2682 | 幕
2683 | 兰
2684 | 悲
2685 | 势
2686 | 午
2687 | 睡
2688 | 密
2689 | 垒
2690 | 警
2691 | 宗
2692 | 严
2693 | 阵
2694 | 江
2695 | 亚
2696 | 攻
2697 | 静
2698 | 抱
2699 | 啥
2700 | 急
2701 | 宿
2702 | 剧
2703 | 词
2704 | 忙
2705 | 牛
2706 | 吴
2707 | 陆
2708 | 维
2709 | 激
2710 | 增
2711 | 聊
2712 | 浪
2713 | 状
2714 | 良
--------------------------------------------------------------------------------
/licenses/lucene-LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
204 |
205 |
206 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
207 | derived from unicode conversion examples available at
208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
209 | from those sources:
210 |
211 | /*
212 | * Copyright 2001-2004 Unicode, Inc.
213 | *
214 | * Disclaimer
215 | *
216 | * This source code is provided as is by Unicode, Inc. No claims are
217 | * made as to fitness for any particular purpose. No warranties of any
218 | * kind are expressed or implied. The recipient agrees to determine
219 | * applicability of information provided. If this file has been
220 | * purchased on magnetic or optical media from Unicode, Inc., the
221 | * sole remedy for any claim will be exchange of defective media
222 | * within 90 days of receipt.
223 | *
224 | * Limitations on Rights to Redistribute This Code
225 | *
226 | * Unicode, Inc. hereby grants the right to freely use the information
227 | * supplied in this file in the creation of products supporting the
228 | * Unicode Standard, and to make copies of this file in any form
229 | * for internal or external distribution as long as this notice
230 | * remains attached.
231 | */
232 |
233 |
234 | Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was
235 | derived from Python 2.4.2 sources available at
236 | http://www.python.org. Full license is here:
237 |
238 | http://www.python.org/download/releases/2.4.2/license/
239 |
240 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
241 | derived from Python 3.1.2 sources available at
242 | http://www.python.org. Full license is here:
243 |
244 | http://www.python.org/download/releases/3.1.2/license/
245 |
246 | Some code in core/src/java/org/apache/lucene/util/automaton was
247 | derived from Brics automaton sources available at
248 | www.brics.dk/automaton/. Here is the copyright from those sources:
249 |
250 | /*
251 | * Copyright (c) 2001-2009 Anders Moeller
252 | * All rights reserved.
253 | *
254 | * Redistribution and use in source and binary forms, with or without
255 | * modification, are permitted provided that the following conditions
256 | * are met:
257 | * 1. Redistributions of source code must retain the above copyright
258 | * notice, this list of conditions and the following disclaimer.
259 | * 2. Redistributions in binary form must reproduce the above copyright
260 | * notice, this list of conditions and the following disclaimer in the
261 | * documentation and/or other materials provided with the distribution.
262 | * 3. The name of the author may not be used to endorse or promote products
263 | * derived from this software without specific prior written permission.
264 | *
265 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
266 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
267 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
268 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
269 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
270 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
271 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
272 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
273 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
274 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
275 | */
276 |
277 | The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton
278 | were automatically generated with the moman/finenight FSA package.
279 | Here is the copyright for those sources:
280 |
281 | # Copyright (c) 2010, Jean-Philippe Barrette-LaPierre,
282 | #
283 | # Permission is hereby granted, free of charge, to any person
284 | # obtaining a copy of this software and associated documentation
285 | # files (the "Software"), to deal in the Software without
286 | # restriction, including without limitation the rights to use,
287 | # copy, modify, merge, publish, distribute, sublicense, and/or sell
288 | # copies of the Software, and to permit persons to whom the
289 | # Software is furnished to do so, subject to the following
290 | # conditions:
291 | #
292 | # The above copyright notice and this permission notice shall be
293 | # included in all copies or substantial portions of the Software.
294 | #
295 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
296 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
297 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
298 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
299 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
300 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
301 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
302 | # OTHER DEALINGS IN THE SOFTWARE.
303 |
304 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
305 | derived from ICU (http://www.icu-project.org)
306 | The full license is available here:
307 | http://source.icu-project.org/repos/icu/icu/trunk/license.html
308 |
309 | /*
310 | * Copyright (C) 1999-2010, International Business Machines
311 | * Corporation and others. All Rights Reserved.
312 | *
313 | * Permission is hereby granted, free of charge, to any person obtaining a copy
314 | * of this software and associated documentation files (the "Software"), to deal
315 | * in the Software without restriction, including without limitation the rights
316 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
317 | * Software, and to permit persons to whom the Software is furnished to do so,
318 | * provided that the above copyright notice(s) and this permission notice appear
319 | * in all copies of the Software and that both the above copyright notice(s) and
320 | * this permission notice appear in supporting documentation.
321 | *
322 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
323 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
324 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
325 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
326 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
327 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
328 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
329 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
330 | *
331 | * Except as contained in this notice, the name of a copyright holder shall not
332 | * be used in advertising or otherwise to promote the sale, use or other
333 | * dealings in this Software without prior written authorization of the
334 | * copyright holder.
335 | */
336 |
337 | The following license applies to the Snowball stemmers:
338 |
339 | Copyright (c) 2001, Dr Martin Porter
340 | Copyright (c) 2002, Richard Boulton
341 | All rights reserved.
342 |
343 | Redistribution and use in source and binary forms, with or without
344 | modification, are permitted provided that the following conditions are met:
345 |
346 | * Redistributions of source code must retain the above copyright notice,
347 | * this list of conditions and the following disclaimer.
348 | * Redistributions in binary form must reproduce the above copyright
349 | * notice, this list of conditions and the following disclaimer in the
350 | * documentation and/or other materials provided with the distribution.
351 | * Neither the name of the copyright holders nor the names of its contributors
352 | * may be used to endorse or promote products derived from this software
353 | * without specific prior written permission.
354 |
355 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
356 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
357 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
358 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
359 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
360 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
361 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
362 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
363 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
364 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
365 |
366 | The following license applies to the KStemmer:
367 |
368 | Copyright © 2003,
369 | Center for Intelligent Information Retrieval,
370 | University of Massachusetts, Amherst.
371 | All rights reserved.
372 |
373 | Redistribution and use in source and binary forms, with or without modification,
374 | are permitted provided that the following conditions are met:
375 |
376 | 1. Redistributions of source code must retain the above copyright notice, this
377 | list of conditions and the following disclaimer.
378 |
379 | 2. Redistributions in binary form must reproduce the above copyright notice,
380 | this list of conditions and the following disclaimer in the documentation
381 | and/or other materials provided with the distribution.
382 |
383 | 3. The names "Center for Intelligent Information Retrieval" and
384 | "University of Massachusetts" must not be used to endorse or promote products
385 | derived from this software without prior written permission. To obtain
386 | permission, contact info@ciir.cs.umass.edu.
387 |
388 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
389 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
390 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
391 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
392 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
393 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
394 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
395 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
396 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
397 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
398 | SUCH DAMAGE.
399 |
400 | The following license applies to the Morfologik project:
401 |
402 | Copyright (c) 2006 Dawid Weiss
403 | Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
404 | All rights reserved.
405 |
406 | Redistribution and use in source and binary forms, with or without modification,
407 | are permitted provided that the following conditions are met:
408 |
409 | * Redistributions of source code must retain the above copyright notice,
410 | this list of conditions and the following disclaimer.
411 |
412 | * Redistributions in binary form must reproduce the above copyright notice,
413 | this list of conditions and the following disclaimer in the documentation
414 | and/or other materials provided with the distribution.
415 |
416 | * Neither the name of Morfologik nor the names of its contributors
417 | may be used to endorse or promote products derived from this software
418 | without specific prior written permission.
419 |
420 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
421 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
422 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
423 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
424 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
425 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
426 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
427 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
428 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
429 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
430 |
431 | ---
432 |
433 | The dictionary comes from Morfologik project. Morfologik uses data from
434 | Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
435 | is licenced on the terms of (inter alia) LGPL and Creative Commons
436 | ShareAlike. The part-of-speech tags were added in Morfologik project and
437 | are not found in the data from sjp.pl. The tagset is similar to IPI PAN
438 | tagset.
439 |
440 | ---
441 |
442 | The following license applies to the Morfeusz project,
443 | used by org.apache.lucene.analysis.morfologik.
444 |
445 | BSD-licensed dictionary of Polish (SGJP)
446 | http://sgjp.pl/morfeusz/
447 |
448 | Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
449 | Marcin Woliński, Robert Wołosz
450 |
451 | All rights reserved.
452 |
453 | Redistribution and use in source and binary forms, with or without
454 | modification, are permitted provided that the following conditions are
455 | met:
456 |
457 | 1. Redistributions of source code must retain the above copyright
458 | notice, this list of conditions and the following disclaimer.
459 |
460 | 2. Redistributions in binary form must reproduce the above copyright
461 | notice, this list of conditions and the following disclaimer in the
462 | documentation and/or other materials provided with the
463 | distribution.
464 |
465 | THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
466 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
467 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
468 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
469 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
470 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
471 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
472 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
473 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
474 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
475 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
476 |
--------------------------------------------------------------------------------
|