();
45 | }
46 |
47 | char[] getSegmentBuff() {
48 | return this.segmentBuff;
49 | }
50 |
51 | /**
52 | * Fill segmentBuff according to the state of the context
53 | *
54 | * @param reader
55 | * @return the length of the valid string available for analysis
56 | * @throws IOException
57 | */
58 | int fillBuffer(Reader reader) throws IOException {
59 | int readCount = 0;
60 | if (this.buffOffset == 0) {
61 | //first read from the reader
62 | readCount = reader.read(segmentBuff);
63 | } else {
64 | int offset = this.available - this.cursor;
65 | if (offset > 0) {
66 | //more was read last time than was processed; copy the unprocessed chars to the head of segmentBuff
67 | System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
68 | readCount = offset;
69 | }
70 | //keep reading from the reader, filling the rest of segmentBuff starting after the preserved tail
71 | readCount += Math.max(0, reader.read(this.segmentBuff, offset, BUFF_SIZE - offset)); //read() returns -1 at EOF; don't let it corrupt the count
72 | }
73 | if (readCount < BUFF_SIZE && readCount > 0) {
74 | char[] lastSegmentBuff = new char[readCount];
75 | System.arraycopy(segmentBuff, 0, lastSegmentBuff, 0, readCount);
76 | segmentBuff = lastSegmentBuff;
77 | }
78 | //record the number of usable chars obtained by the last read from the Reader
79 | this.available = readCount;
80 | //reset the current cursor
81 | this.cursor = 0;
82 | return readCount;
83 | }
84 |
85 | /**
86 | * The buffer is locked as long as buffLocker
87 | * contains at least one segmenterName
88 | *
89 | * @return boolean whether the buffer is locked
90 | */
91 | boolean isBufferLocked() {
92 | return this.buffLocker.size() > 0;
93 | }
94 |
95 | /**
96 | * Whether the current segmentBuff has been fully consumed,
97 | * i.e. the cursor has advanced to this.available, the end of segmentBuff
98 | *
99 | * @return
100 | */
101 | boolean bufferConsumed() {
102 | return this.cursor == this.available;
103 | }
104 |
105 | /**
106 | * Decide whether segmentBuff needs to read in new data
107 | *
108 | * When all of the following hold:
109 | * 1. available == BUFF_SIZE: the buffer is fully loaded
110 | * 2. cursor < available - 1 && cursor > available - BUFF_EXHAUST_CRITICAL: the cursor is inside the critical zone
111 | * 3. !isBufferLocked(): no segmenter is holding the buffer
112 | * the current loop must be interrupted (the buffer is shifted and new data is read)
113 | *
114 | * @return
115 | */
116 | boolean needRefillBuffer() {
117 | return this.available == BUFF_SIZE
118 | && this.cursor < this.available - 1
119 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
120 | && !this.isBufferLocked();
121 | }
122 |
123 | /**
124 | * Accumulate the offset of the current segmentBuff relative to the start of the reader
125 | */
126 | void markBufferOffset() {
127 | this.buffOffset += this.cursor;
128 | }
129 |
130 | /**
131 | * Reset the tokenization context state
132 | */
133 | void reset() {
134 | this.buffLocker.clear();
135 | this.available = 0;
136 | this.buffOffset = 0;
137 | this.charTypes = new int[BUFF_SIZE];
138 | this.cursor = 0;
139 | this.segmentBuff = new char[BUFF_SIZE];
140 | this.results.clear();
141 | }
142 |
143 | /**
144 | * Take the next term from the result set,
145 | * removing it from the queue
146 | *
147 | * @return
148 | */
150 | Term getNextTerm() {
151 | //retrieve and remove the first term from the result set
152 | return this.results.pollFirst();
153 | }
154 |
155 | /**
156 | * Add a tokenization result to results
157 | */
158 | void addToResults(Term term) {
159 | results.add(term);
160 | }
161 |
162 | }
163 |
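A note on fillBuffer() above: the else-branch implements a shift-and-refill pattern, moving the unprocessed tail of the buffer to the head before reading more input. Below is a minimal, self-contained sketch of the same idea (RefillDemo and all names in it are hypothetical, not part of this repo); the Math.max guard matters because Reader.read() returns -1 at end of stream:

    import java.io.IOException;
    import java.io.Reader;
    import java.io.StringReader;

    final class RefillDemo {
        // Hypothetical helper mirroring fillBuffer(): keep the unprocessed tail,
        // shift it to the head of the buffer, then fill the remaining space.
        static int refill(Reader reader, char[] buf, int cursor, int available) throws IOException {
            int tail = available - cursor;                     // chars read but not yet processed
            if (tail > 0) {
                System.arraycopy(buf, cursor, buf, 0, tail);   // shift them to the head
            }
            int n = reader.read(buf, tail, buf.length - tail); // refill the rest
            return tail + Math.max(0, n);                      // read() returns -1 at EOF
        }

        public static void main(String[] args) throws IOException {
            char[] buf = new char[8];
            Reader r = new StringReader("abcdefghij");
            int available = r.read(buf);                       // first fill: "abcdefgh"
            available = refill(r, buf, 6, available);          // keep "gh", then read "ij"
            System.out.println(new String(buf, 0, available)); // prints "ghij"
        }
    }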
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene4/CharacterUtil.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene4;
2 |
3 | /**
4 | *
5 | * Character type identification utility
6 | */
7 | class CharacterUtil {
8 |
9 | public static final int CHAR_USELESS = 0;
10 |
11 | public static final int CHAR_ARABIC = 0X00000001;
12 |
13 | public static final int CHAR_ENGLISH = 0X00000002;
14 |
15 | public static final int CHAR_CHINESE = 0X00000004;
16 |
17 | public static final int CHAR_OTHER_CJK = 0X00000008;
18 |
19 |
20 | /**
21 | * Identify the type of a character
22 | * @param input
23 | * @return int one of the character type constants defined in CharacterUtil
24 | */
25 | static int identifyCharType(char input){
26 | if(input >= '0' && input <= '9'){
27 | return CHAR_ARABIC;
28 |
29 | }else if((input >= 'a' && input <= 'z')
30 | || (input >= 'A' && input <= 'Z')){
31 | return CHAR_ENGLISH;
32 |
33 | }else {
34 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
35 |
36 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
37 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
38 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){
39 | //currently known Unicode blocks for Chinese characters
40 | return CHAR_CHINESE;
41 |
42 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //full-width digits and Japanese/Korean characters
43 | //Korean (Hangul) blocks
44 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
45 | || ub == Character.UnicodeBlock.HANGUL_JAMO
46 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
47 | //Japanese blocks
48 | || ub == Character.UnicodeBlock.HIRAGANA //Hiragana
49 | || ub == Character.UnicodeBlock.KATAKANA //Katakana
50 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){
51 | return CHAR_OTHER_CJK;
52 |
53 | }
54 | }
55 | //all other characters are left unprocessed
56 | return CHAR_USELESS;
57 | }
58 |
59 | /**
60 | * Normalize a character (full-width to half-width, upper case to lower case)
61 | * @param input
62 | * @return char
63 | */
64 | static char regularize(char input,boolean lowercase){
65 | if (input == 12288) {
66 | input = (char) 32; //full-width space (U+3000) -> ASCII space
67 |
68 | }else if (input > 65280 && input < 65375) {
69 | input = (char) (input - 65248); //full-width forms (U+FF01..U+FF5E) -> ASCII equivalents
70 |
71 | }else if (input >= 'A' && input <= 'Z' && lowercase) {
72 | input += 32; //upper case -> lower case
73 | }
74 |
75 | return input;
76 | }
77 | }
78 |
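A quick demonstration of how these helpers behave (a hypothetical demo class; it must live in the same package because CharacterUtil is package-private). Note that regularize() folds width and case in separate else-if branches, so a full-width letter is only width-folded within a single call:

    package com.hankcs.lucene4;

    public class CharacterUtilDemo {
        public static void main(String[] args) {
            System.out.println(CharacterUtil.identifyCharType('中') == CharacterUtil.CHAR_CHINESE);   // true
            System.out.println(CharacterUtil.identifyCharType('7') == CharacterUtil.CHAR_ARABIC);     // true
            System.out.println(CharacterUtil.identifyCharType('の') == CharacterUtil.CHAR_OTHER_CJK); // true (Hiragana)
            System.out.println(CharacterUtil.regularize('　', false)); // full-width space -> ' '
            System.out.println(CharacterUtil.regularize('Ａ', true));  // 'A': width folded, not lower-cased in the same call
            System.out.println(CharacterUtil.regularize('Z', true));   // 'z'
        }
    }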
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene4/HanLPAnalyzer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene4;
2 |
3 | import com.hankcs.hanlp.tokenizer.StandardTokenizer;
4 |
5 | import org.apache.lucene.analysis.Analyzer;
6 | import org.apache.lucene.analysis.Tokenizer;
7 |
9 | import java.util.Set;
10 |
11 | public class HanLPAnalyzer extends Analyzer
12 | {
13 |
14 | boolean enablePorterStemming;
15 | public Set<String> filter;
16 |
17 | /**
18 | * @param filter stop words
19 | * @param enablePorterStemming whether to stem (English only)
20 | */
21 | public HanLPAnalyzer(Set<String> filter, boolean enablePorterStemming)
22 | {
23 | this.filter = filter;
24 | this.enablePorterStemming = enablePorterStemming;
25 | }
25 |
26 | /**
27 | * @param enablePorterStemming whether to stem English words, normalizing plural and tense forms
28 | */
29 | public HanLPAnalyzer(boolean enablePorterStemming)
30 | {
31 | this.enablePorterStemming = enablePorterStemming;
32 | }
33 |
34 | public HanLPAnalyzer()
35 | {
36 | super();
37 | }
38 |
39 | @Override
40 | protected TokenStreamComponents createComponents(String fieldName) {
41 | Tokenizer tokenizer = new HanLPTokenizer(StandardTokenizer.SEGMENT.enableOffset(true), filter, enablePorterStemming);
42 | return new TokenStreamComponents(tokenizer);
43 | }
44 | }
45 |
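A usage sketch, assuming the Lucene-5-style API this class compiles against (the demo class, field name, and sample text are hypothetical):

    import com.hankcs.lucene4.HanLPAnalyzer;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class HanLPAnalyzerDemo {
        public static void main(String[] args) throws Exception {
            Analyzer analyzer = new HanLPAnalyzer();
            try (TokenStream ts = analyzer.tokenStream("content", "商品和服务")) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
                ts.reset(); // mandatory before the first incrementToken()
                while (ts.incrementToken()) {
                    System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
                }
                ts.end();
            }
        }
    }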
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene4/HanLPIndexAnalyzer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene4;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import org.apache.lucene.analysis.Analyzer;
5 | import org.apache.lucene.analysis.Tokenizer;
6 | import java.util.Set;
7 |
8 | public class HanLPIndexAnalyzer extends Analyzer
9 | {
10 |
11 | boolean pstemming;
12 | public Set<String> filter;
13 |
14 | /**
15 | * @param filter stop words
16 | * @param pstemming whether to stem (English only)
17 | */
18 | public HanLPIndexAnalyzer(Set<String> filter, boolean pstemming)
19 | {
20 | this.filter = filter;
21 | this.pstemming = pstemming;
22 | }
22 |
23 | /**
24 | * @param pstemming whether to stem English words, normalizing plural and tense forms
25 | */
26 | public HanLPIndexAnalyzer(boolean pstemming)
27 | {
28 | this.pstemming = pstemming;
29 | }
30 |
31 | public HanLPIndexAnalyzer()
32 | {
33 | super();
34 | }
35 |
36 | @Override
37 | protected TokenStreamComponents createComponents(String fieldName) {
38 | Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true).enableCustomDictionary(true).
39 | enablePlaceRecognize(true).enableOrganizationRecognize(true), filter, pstemming);
40 | return new TokenStreamComponents(tokenizer);
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene4/HanLPTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene4;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.seg.Segment;
5 | import com.hankcs.hanlp.seg.common.Term;
6 | import org.apache.lucene.analysis.Tokenizer;
7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
9 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
11 |
12 | import java.io.BufferedReader;
13 | import java.io.IOException;
14 | import java.util.Set;
17 |
18 | public class HanLPTokenizer extends Tokenizer {
19 | //term text attribute
20 | private final CharTermAttribute termAtt;
21 | //term offset attribute
22 | private final OffsetAttribute offsetAtt;
23 | //term type attribute (categories follow the constants in org.wltea.analyzer.core.Lexeme)
24 | private final TypeAttribute typeAtt;
25 | private PositionIncrementAttribute posIncrAtt;
26 | private Set<String> filter;
27 | private boolean enablePorterStemming;
28 | private HanlpSegmenter hanlpSegmenter;
29 | private int position;
30 |
31 | public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
32 | super();
33 | this.filter = filter;
34 | this.enablePorterStemming = enablePorterStemming;
35 | offsetAtt = addAttribute(OffsetAttribute.class);
36 | termAtt = addAttribute(CharTermAttribute.class);
37 | typeAtt = addAttribute(TypeAttribute.class);
38 | posIncrAtt = addAttribute(PositionIncrementAttribute.class);
39 | hanlpSegmenter = new HanlpSegmenter(input, segment);
40 | }
41 |
42 | @Override
43 | final public boolean incrementToken() throws IOException {
44 | clearAttributes();
45 | this.position = 0;
46 | Term term = hanlpSegmenter.next();
47 | if (term != null) {
48 | posIncrAtt.setPositionIncrement(this.position + 1);
49 | termAtt.setEmpty().append(term.word.toLowerCase());
50 | termAtt.setLength(term.word.length());
51 | int length = term.word.length();
52 | offsetAtt.setOffset(term.offset,
53 | term.offset + length);
54 | typeAtt.setType(term.nature.name());
55 | return true;
56 | } else {
57 | return false;
58 | }
59 | }
60 |
61 | /**
62 | * This override is required; without it, batch indexing of files fails
63 | */
64 | @Override
65 | public void reset() throws IOException {
66 | super.reset();
67 | this.position = 0;
68 | hanlpSegmenter.reset(new BufferedReader(this.input));
69 | }
70 |
71 | }
72 |
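The tokenizer can also be driven directly, without an Analyzer; a sketch assuming a Lucene version that provides Tokenizer.setReader() (the demo class and sample text are hypothetical):

    import java.io.StringReader;
    import com.hankcs.hanlp.HanLP;
    import com.hankcs.lucene4.HanLPTokenizer;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

    public class HanLPTokenizerDemo {
        public static void main(String[] args) throws Exception {
            Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableOffset(true), null, false);
            tokenizer.setReader(new StringReader("中华人民共和国"));
            tokenizer.reset(); // reset() hands the reader to the underlying HanlpSegmenter
            CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
            TypeAttribute type = tokenizer.getAttribute(TypeAttribute.class);
            while (tokenizer.incrementToken()) {
                System.out.println(term + "/" + type.type()); // word/part-of-speech pairs
            }
            tokenizer.end();
            tokenizer.close();
        }
    }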
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene4/HanlpSegmenter.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene4;
2 |
3 | import com.hankcs.hanlp.seg.Segment;
4 | import com.hankcs.hanlp.seg.common.Term;
5 |
6 | import java.io.IOException;
7 | import java.io.Reader;
8 | import java.util.List;
9 |
10 | /**
11 | * Main HanLP segmenter class
12 | */
13 | public final class HanlpSegmenter {
14 |
15 | //character stream reader
16 | private Reader input;
17 | //tokenization context
18 | private AnalyzeContext context;
19 | private Segment segment;
20 | private int readNum;
21 |
22 | /**
23 | * Segmenter constructor
24 | *
25 | * @param input
26 | */
27 | public HanlpSegmenter(Reader input, Segment segment) {
28 |
29 | this.input = input;
30 | this.segment = segment;
31 | this.init();
32 | }
33 |
34 |
35 | /**
36 | * Initialization
37 | */
38 | private void init() {
39 | //initialize the tokenization context
40 | this.context = new AnalyzeContext();
41 | }
42 |
43 | /**
44 | * Tokenize and fetch the next term
45 | *
46 | * @return Term the next term object, or null when the input is exhausted
47 | */
49 | public synchronized Term next() {
50 | try {
51 | Term term = null;
52 | while ((term = context.getNextTerm()) == null) {
53 | /*
54 | * Read data from the reader and fill the buffer.
55 | * If the reader is consumed over several reads, the buffer has to be shifted.
56 | */
57 | int available = context.fillBuffer(this.input);
58 | if (available <= 0) {
59 | //the reader has been fully consumed
60 | context.reset();
61 | return null;
62 |
63 | } else {
64 | List<Term> lists = segment.seg(String.valueOf(context.getSegmentBuff()));
65 | for (Term t : lists) {
66 | context.addToResults(t);
67 | }
68 | readNum++;
69 | //the char buffer is nearly consumed; new chars need to be read in
70 | if (context.needRefillBuffer()) {
71 | break;
72 | }
73 | }
74 | //bufferConsumed() only reports whether the cursor has reached available; its boolean result is unused here
75 | context.bufferConsumed();
76 | //record the buffer offset for this tokenization round
77 | context.markBufferOffset();
78 | }
79 | if (term != null) {
80 | term.offset = term.offset + (AnalyzeContext.BUFF_SIZE * (readNum - 1));
81 | }
82 | return term;
83 | } catch (Exception e) {
84 | e.printStackTrace();
85 | }
86 | return null;
87 | }
88 |
89 | /**
90 | * Reset the segmenter to its initial state
91 | *
92 | * @param input
93 | */
94 | public synchronized void reset(Reader input) {
95 | this.input = input;
96 | this.readNum = 0;
97 | context.reset();
98 | }
99 | }
100 |
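Outside Lucene, the segmenter can be driven like an iterator; a small sketch (the demo class and sample text are hypothetical; enableOffset(true) is assumed so that Term.offset is meaningful):

    import java.io.StringReader;
    import com.hankcs.hanlp.HanLP;
    import com.hankcs.hanlp.seg.common.Term;
    import com.hankcs.lucene4.HanlpSegmenter;

    public class HanlpSegmenterDemo {
        public static void main(String[] args) {
            HanlpSegmenter segmenter = new HanlpSegmenter(
                    new StringReader("今天天气不错"), HanLP.newSegment().enableOffset(true));
            for (Term t = segmenter.next(); t != null; t = segmenter.next()) {
                System.out.println(t.word + "/" + t.nature + " @" + t.offset);
            }
        }
    }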
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene4/PorterStemmer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene4;
2 |
3 | import org.apache.lucene.util.ArrayUtil;
4 |
8 |
9 | import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
10 |
11 | /**
12 | * English stemming adapted from Lucene:
13 | * Stemmer, implementing the Porter Stemming Algorithm
14 | *
15 | * The Stemmer class transforms a word into its root form. The input word can be
16 | * provided a character at a time (by calling add()), or at once by calling one of
17 | * the various stem(something) methods.
18 | */
19 |
20 | public class PorterStemmer
21 | {
22 | private char[] b;
23 | private int i, /* offset into b */
24 | j, k, k0;
25 | private boolean dirty = false;
26 | private static final int INITIAL_SIZE = 50;
27 |
28 | public PorterStemmer()
29 | {
30 | b = new char[INITIAL_SIZE];
31 | i = 0;
32 | }
33 |
34 | /**
35 | * reset() resets the stemmer so it can stem another word. If you invoke the
36 | * stemmer by calling add(char) and then stem(), you must call reset()
37 | * before starting another word.
38 | */
39 | public void reset()
40 | {
41 | i = 0;
42 | dirty = false;
43 | }
44 |
45 | /**
46 | * Add a character to the word being stemmed. When you are finished adding
47 | * characters, you can call stem(void) to process the word.
48 | */
49 | public void add(char ch)
50 | {
51 | if (b.length <= i)
52 | {
53 | b = ArrayUtil.grow(b, i + 1);
54 | }
55 | b[i++] = ch;
56 | }
57 |
58 | /**
59 | * After a word has been stemmed, it can be retrieved by toString(), or a
60 | * reference to the internal buffer can be retrieved by getResultBuffer and
61 | * getResultLength (which is generally more efficient.)
62 | */
63 | @Override
64 | public String toString()
65 | {
66 | return new String(b, 0, i);
67 | }
68 |
69 | /**
70 | * Returns the length of the word resulting from the stemming process.
71 | */
72 | public int getResultLength()
73 | {
74 | return i;
75 | }
76 |
77 | /**
78 | * Returns a reference to a character buffer containing the results of the
79 | * stemming process. You also need to consult getResultLength() to determine
80 | * the length of the result.
81 | */
82 | public char[] getResultBuffer()
83 | {
84 | return b;
85 | }
86 |
87 | /* cons(i) is true <=> b[i] is a consonant. */
88 |
89 | private final boolean cons(int i)
90 | {
91 | switch (b[i])
92 | {
93 | case 'a':
94 | case 'e':
95 | case 'i':
96 | case 'o':
97 | case 'u':
98 | return false;
99 | case 'y':
100 | return (i == k0) ? true : !cons(i - 1);
101 | default:
102 | return true;
103 | }
104 | }
105 |
106 | /*
107 | * m() measures the number of consonant sequences between k0 and j. if c is
108 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
109 | * presence,
110 | *
111 | * <c><v> gives 0, <c>vc<v> gives 1, <c>vcvc<v> gives 2,
112 | * <c>vcvcvc<v> gives 3,
113 | * ....
113 | */
114 |
115 | private final int m()
116 | {
117 | int n = 0;
118 | int i = k0;
119 | while (true)
120 | {
121 | if (i > j)
122 | return n;
123 | if (!cons(i))
124 | break;
125 | i++;
126 | }
127 | i++;
128 | while (true)
129 | {
130 | while (true)
131 | {
132 | if (i > j)
133 | return n;
134 | if (cons(i))
135 | break;
136 | i++;
137 | }
138 | i++;
139 | n++;
140 | while (true)
141 | {
142 | if (i > j)
143 | return n;
144 | if (!cons(i))
145 | break;
146 | i++;
147 | }
148 | i++;
149 | }
150 | }
151 |
152 | /* vowelinstem() is true <=> k0,...j contains a vowel */
153 |
154 | private final boolean vowelinstem()
155 | {
156 | int i;
157 | for (i = k0; i <= j; i++)
158 | if (!cons(i))
159 | return true;
160 | return false;
161 | }
162 |
163 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
164 |
165 | private final boolean doublec(int j)
166 | {
167 | if (j < k0 + 1)
168 | return false;
169 | if (b[j] != b[j - 1])
170 | return false;
171 | return cons(j);
172 | }
173 |
174 | /*
175 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
176 | * and also if the second c is not w,x or y. this is used when trying to
177 | * restore an e at the end of a short word. e.g.
178 | *
179 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
180 | */
181 |
182 | private final boolean cvc(int i)
183 | {
184 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2))
185 | return false;
186 | else
187 | {
188 | int ch = b[i];
189 | if (ch == 'w' || ch == 'x' || ch == 'y')
190 | return false;
191 | }
192 | return true;
193 | }
194 |
195 | private final boolean ends(String s)
196 | {
197 | int l = s.length();
198 | int o = k - l + 1;
199 | if (o < k0)
200 | return false;
201 | for (int i = 0; i < l; i++)
202 | if (b[o + i] != s.charAt(i))
203 | return false;
204 | j = k - l;
205 | return true;
206 | }
207 |
208 | /*
209 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting
210 | * k.
211 | */
212 |
213 | void setto(String s)
214 | {
215 | int l = s.length();
216 | int o = j + 1;
217 | for (int i = 0; i < l; i++)
218 | b[o + i] = s.charAt(i);
219 | k = j + l;
220 | dirty = true;
221 | }
222 |
223 | /* r(s) is used further down. */
224 |
225 | void r(String s)
226 | {
227 | if (m() > 0)
228 | setto(s);
229 | }
230 |
231 | /*
232 | * step1() gets rid of plurals and -ed or -ing. e.g.
233 | *
234 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
235 | *
236 | * feed -> feed agreed -> agree disabled -> disable
237 | *
238 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
239 | * mess
240 | *
241 | * meetings -> meet
242 | */
243 |
244 | private final void step1()
245 | {
246 | if (b[k] == 's')
247 | {
248 | if (ends("sses"))
249 | k -= 2;
250 | else if (ends("ies"))
251 | setto("i");
252 | else if (b[k - 1] != 's')
253 | k--;
254 | }
255 | if (ends("eed"))
256 | {
257 | if (m() > 0)
258 | k--;
259 | }
260 | else if ((ends("ed") || ends("ing")) && vowelinstem())
261 | {
262 | k = j;
263 | if (ends("at"))
264 | setto("ate");
265 | else if (ends("bl"))
266 | setto("ble");
267 | else if (ends("iz"))
268 | setto("ize");
269 | else if (doublec(k))
270 | {
271 | int ch = b[k--];
272 | if (ch == 'l' || ch == 's' || ch == 'z')
273 | k++;
274 | }
275 | else if (m() == 1 && cvc(k))
276 | setto("e");
277 | }
278 | }
279 |
280 | /* step2() turns terminal y to i when there is another vowel in the stem. */
281 |
282 | private final void step2()
283 | {
284 | if (ends("y") && vowelinstem())
285 | {
286 | b[k] = 'i';
287 | dirty = true;
288 | }
289 | }
290 |
291 | /*
292 | * step3() maps double suffices to single ones. so -ization ( = -ize plus
293 | * -ation) maps to -ize etc. note that the string before the suffix must
294 | * give m() > 0.
295 | */
296 |
297 | private final void step3()
298 | {
299 | if (k == k0)
300 | return; /* For Bug 1 */
301 | switch (b[k - 1])
302 | {
303 | case 'a':
304 | if (ends("ational"))
305 | {
306 | r("ate");
307 | break;
308 | }
309 | if (ends("tional"))
310 | {
311 | r("tion");
312 | break;
313 | }
314 | break;
315 | case 'c':
316 | if (ends("enci"))
317 | {
318 | r("ence");
319 | break;
320 | }
321 | if (ends("anci"))
322 | {
323 | r("ance");
324 | break;
325 | }
326 | break;
327 | case 'e':
328 | if (ends("izer"))
329 | {
330 | r("ize");
331 | break;
332 | }
333 | break;
334 | case 'l':
335 | if (ends("bli"))
336 | {
337 | r("ble");
338 | break;
339 | }
340 | if (ends("alli"))
341 | {
342 | r("al");
343 | break;
344 | }
345 | if (ends("entli"))
346 | {
347 | r("ent");
348 | break;
349 | }
350 | if (ends("eli"))
351 | {
352 | r("e");
353 | break;
354 | }
355 | if (ends("ousli"))
356 | {
357 | r("ous");
358 | break;
359 | }
360 | break;
361 | case 'o':
362 | if (ends("ization"))
363 | {
364 | r("ize");
365 | break;
366 | }
367 | if (ends("ation"))
368 | {
369 | r("ate");
370 | break;
371 | }
372 | if (ends("ator"))
373 | {
374 | r("ate");
375 | break;
376 | }
377 | break;
378 | case 's':
379 | if (ends("alism"))
380 | {
381 | r("al");
382 | break;
383 | }
384 | if (ends("iveness"))
385 | {
386 | r("ive");
387 | break;
388 | }
389 | if (ends("fulness"))
390 | {
391 | r("ful");
392 | break;
393 | }
394 | if (ends("ousness"))
395 | {
396 | r("ous");
397 | break;
398 | }
399 | break;
400 | case 't':
401 | if (ends("aliti"))
402 | {
403 | r("al");
404 | break;
405 | }
406 | if (ends("iviti"))
407 | {
408 | r("ive");
409 | break;
410 | }
411 | if (ends("biliti"))
412 | {
413 | r("ble");
414 | break;
415 | }
416 | break;
417 | case 'g':
418 | if (ends("logi"))
419 | {
420 | r("log");
421 | break;
422 | }
423 | }
424 | }
425 |
426 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
427 |
428 | private final void step4()
429 | {
430 | switch (b[k])
431 | {
432 | case 'e':
433 | if (ends("icate"))
434 | {
435 | r("ic");
436 | break;
437 | }
438 | if (ends("ative"))
439 | {
440 | r("");
441 | break;
442 | }
443 | if (ends("alize"))
444 | {
445 | r("al");
446 | break;
447 | }
448 | break;
449 | case 'i':
450 | if (ends("iciti"))
451 | {
452 | r("ic");
453 | break;
454 | }
455 | break;
456 | case 'l':
457 | if (ends("ical"))
458 | {
459 | r("ic");
460 | break;
461 | }
462 | if (ends("ful"))
463 | {
464 | r("");
465 | break;
466 | }
467 | break;
468 | case 's':
469 | if (ends("ness"))
470 | {
471 | r("");
472 | break;
473 | }
474 | break;
475 | }
476 | }
477 |
478 | /* step5() takes off -ant, -ence etc., in context vcvc. */
479 |
480 | private final void step5()
481 | {
482 | if (k == k0)
483 | return; /* for Bug 1 */
484 | switch (b[k - 1])
485 | {
486 | case 'a':
487 | if (ends("al"))
488 | break;
489 | return;
490 | case 'c':
491 | if (ends("ance"))
492 | break;
493 | if (ends("ence"))
494 | break;
495 | return;
496 | case 'e':
497 | if (ends("er"))
498 | break;
499 | return;
500 | case 'i':
501 | if (ends("ic"))
502 | break;
503 | return;
504 | case 'l':
505 | if (ends("able"))
506 | break;
507 | if (ends("ible"))
508 | break;
509 | return;
510 | case 'n':
511 | if (ends("ant"))
512 | break;
513 | if (ends("ement"))
514 | break;
515 | if (ends("ment"))
516 | break;
517 | /* element etc. not stripped before the m */
518 | if (ends("ent"))
519 | break;
520 | return;
521 | case 'o':
522 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't'))
523 | break;
524 | /* j >= 0 fixes Bug 2 */
525 | if (ends("ou"))
526 | break;
527 | return;
528 | /* takes care of -ous */
529 | case 's':
530 | if (ends("ism"))
531 | break;
532 | return;
533 | case 't':
534 | if (ends("ate"))
535 | break;
536 | if (ends("iti"))
537 | break;
538 | return;
539 | case 'u':
540 | if (ends("ous"))
541 | break;
542 | return;
543 | case 'v':
544 | if (ends("ive"))
545 | break;
546 | return;
547 | case 'z':
548 | if (ends("ize"))
549 | break;
550 | return;
551 | default:
552 | return;
553 | }
554 | if (m() > 1)
555 | k = j;
556 | }
557 |
558 | /* step6() removes a final -e if m() > 1. */
559 |
560 | private final void step6()
561 | {
562 | j = k;
563 | if (b[k] == 'e')
564 | {
565 | int a = m();
566 | if (a > 1 || a == 1 && !cvc(k - 1))
567 | k--;
568 | }
569 | if (b[k] == 'l' && doublec(k) && m() > 1)
570 | k--;
571 | }
572 |
573 | /**
574 | * Stem a word contained in a portion of a char[] array. Returns true if the
575 | * stemming process resulted in a word different from the input. You can
576 | * retrieve the result with getResultLength()/getResultBuffer() or
577 | * toString().
578 | */
579 | public boolean stem(char[] wordBuffer, int offset, int wordLen)
580 | {
581 | reset();
582 | if (b.length < wordLen)
583 | {
584 | b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
585 | }
586 | System.arraycopy(wordBuffer, offset, b, 0, wordLen);
587 | i = wordLen;
588 | return stem(0);
589 | }
590 |
591 | /**
592 | * Stem the word placed into the Stemmer buffer through calls to add().
593 | * Returns true if the stemming process resulted in a word different from
594 | * the input. You can retrieve the result with
595 | * getResultLength()/getResultBuffer() or toString().
596 | */
597 | public boolean stem()
598 | {
599 | return stem(0);
600 | }
601 |
602 | public boolean stem(int i0)
603 | {
604 | k = i - 1;
605 | k0 = i0;
606 | if (k > k0 + 1)
607 | {
608 | step1();
609 | step2();
610 | step3();
611 | step4();
612 | step5();
613 | step6();
614 | }
615 | // Also, a word is considered dirty if we lopped off letters
616 | // Thanks to Ifigenia Vairelles for pointing this out.
617 | if (i != k + 1)
618 | dirty = true;
619 | i = k + 1;
620 | return dirty;
621 | }
622 |
623 | }
624 |
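The transformations documented in the step1() comment can be reproduced directly; a minimal sketch (the demo class is hypothetical):

    import com.hankcs.lucene4.PorterStemmer;

    public class PorterStemmerDemo {
        public static void main(String[] args) {
            PorterStemmer stemmer = new PorterStemmer();
            for (String word : new String[]{"caresses", "ponies", "meetings", "agreed"}) {
                char[] chars = word.toCharArray();
                stemmer.stem(chars, 0, chars.length); // stem() resets internally per word
                System.out.println(word + " -> " + stemmer); // caress, poni, meet, agree
            }
        }
    }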
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/HanLPAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import com.hankcs.hanlp.utility.Predefine;
4 | import com.hankcs.lucene4.HanLPIndexAnalyzer;
5 | import org.elasticsearch.common.inject.Inject;
6 | import org.elasticsearch.common.inject.assistedinject.Assisted;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 | import java.io.File;
11 | import java.nio.file.Path;
12 |
13 | /**
14 | * Provides HanLPIndexAnalyzer instances to Elasticsearch
15 | */
15 | public class HanLPAnalyzerProvider extends AbstractIndexAnalyzerProvider<HanLPIndexAnalyzer> {
16 |
17 | private final HanLPIndexAnalyzer analyzer;
18 |
19 | @Inject
20 | public HanLPAnalyzerProvider(IndexSettings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
21 | super(indexSettings, name, settings);
22 | Path pluginsDir = env.pluginsFile();
23 | Predefine.HANLP_PROPERTIES_PATH = pluginsDir.toString() + File.separator + "analysis-hanlp" + File.separator + "hanlp.properties";
24 | analyzer = new HanLPIndexAnalyzer(true);
25 | }
26 |
27 | public static HanLPAnalyzerProvider getIndexAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
28 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings);
29 | }
30 |
31 | public static HanLPAnalyzerProvider getSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
32 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings);
33 | }
34 |
35 | @Override
36 | public HanLPIndexAnalyzer get() {
37 | return this.analyzer;
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/HanLPTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import com.hankcs.hanlp.utility.Predefine;
4 | import com.hankcs.hanlp.HanLP;
5 | import com.hankcs.lucene4.HanLPTokenizer;
6 | import org.apache.lucene.analysis.Tokenizer;
5 | import org.elasticsearch.common.inject.Inject;
6 | import org.elasticsearch.common.inject.assistedinject.Assisted;
7 | import org.elasticsearch.common.settings.Settings;
8 | import org.elasticsearch.env.Environment;
9 | import org.elasticsearch.index.IndexSettings;
10 | import java.io.File;
11 | import java.nio.file.Path;
12 |
13 | /**
14 | * Factory for HanLP-backed tokenizers
15 | */
15 | public class HanLPTokenizerFactory extends AbstractTokenizerFactory {
16 |
17 | private boolean enablePorterStemming;
18 | private boolean enableIndexMode;
19 |
20 | public HanLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
21 | super(indexSettings, name, settings);
22 | Path pluginsDir = env.pluginsFile();
23 | Predefine.HANLP_PROPERTIES_PATH = pluginsDir.toString() + File.separator + "analysis-hanlp" + File.separator + "hanlp.properties";
24 | enablePorterStemming = settings.getAsBoolean("enablePorterStemming", false);
25 | }
26 |
27 | public static HanLPTokenizerFactory getIndexTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
28 | return new HanLPTokenizerFactory(indexSettings, env, name, settings).setIndexMode(true);
29 | }
30 |
31 | public static HanLPTokenizerFactory getSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
32 | return new HanLPTokenizerFactory(indexSettings, env, name, settings).setIndexMode(false);
33 | }
34 |
35 | private HanLPTokenizerFactory setIndexMode(boolean enableIndexMode) {
36 | this.enableIndexMode = enableIndexMode;
37 | return this;
38 | }
39 |
40 | @Override
41 | public Tokenizer create() {
42 | return new HanLPTokenizer(HanLP.newSegment().enablePlaceRecognize(true).enableCustomDictionary(true).enableIndexMode(enableIndexMode).enableOffset(true), null, enablePorterStemming);
43 | }
44 |
45 | }
46 |
47 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/hanlp/AnalysisHanlpPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.hanlp;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 | import org.elasticsearch.index.analysis.*;
5 | import org.elasticsearch.indices.analysis.AnalysisModule;
6 | import org.elasticsearch.plugins.AnalysisPlugin;
7 | import org.elasticsearch.plugins.Plugin;
8 |
9 | import java.util.HashMap;
10 | import java.util.Map;
11 |
12 | /**
13 | * Registers the HanLP analysis module with Elasticsearch.
14 | */
15 | public class AnalysisHanlpPlugin extends Plugin implements AnalysisPlugin {
16 |
17 | public static final String PLUGIN_NAME = "analysis-hanlp";
18 |
19 | @Override
20 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
21 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
22 |
23 | extra.put("hanlp-index", HanLPTokenizerFactory::getIndexTokenizerFactory);
24 | extra.put("hanlp-smart", HanLPTokenizerFactory::getSmartTokenizerFactory);
25 | extra.put("hanlp", HanLPTokenizerFactory::getIndexTokenizerFactory);
26 | return extra;
27 | }
28 |
29 | @Override
30 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
31 | Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
32 |
33 | extra.put("hanlp-index", HanLPAnalyzerProvider::getIndexAnalyzerProvider);
34 | extra.put("hanlp-smart", HanLPAnalyzerProvider::getSmartAnalyzerProvider);
35 | extra.put("hanlp", HanLPAnalyzerProvider::getIndexAnalyzerProvider);
36 | return extra;
37 | }
38 |
39 | }
40 |
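Once the plugin is installed, the names registered above ("hanlp", "hanlp-index", "hanlp-smart") can be used wherever Elasticsearch accepts an analyzer or tokenizer name. A sketch against the 5.x Java client (the demo class, method, and index name are hypothetical):

    import org.elasticsearch.client.Client;
    import org.elasticsearch.common.settings.Settings;

    public class HanlpIndexSetupDemo {
        // Create an index whose default analyzer is the plugin's "hanlp-index".
        static void createIndexWithHanlp(Client client) {
            client.admin().indices().prepareCreate("demo")
                    .setSettings(Settings.builder()
                            .put("index.analysis.analyzer.default.type", "hanlp-index"))
                    .get();
        }
    }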
--------------------------------------------------------------------------------
/src/main/resources/hanlp.properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kepmov/elasticsearch-analysis-hanlp/04f35320e2dec54c2c3a0c2e6758864797ea8cfc/src/main/resources/hanlp.properties
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | # _site/ <-- the contents that will be served
11 | # plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | # <arbitrary name1>.jar <-- classes, resources, dependencies
21 | # <arbitrary nameN>.jar <-- any number of jars
22 | # plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=5.0
29 | # java.version=1.8
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=${project.version}
38 | #
39 | # 'name': the plugin name
40 | name=${elasticsearch.plugin.name}
41 |
42 | ### mandatory elements for site plugins:
43 | #
44 | # 'site': set to true to indicate contents of the _site/
45 | # directory in the root of the plugin should be served.
46 | site=${elasticsearch.plugin.site}
47 | #
48 | ### mandatory elements for jvm plugins :
49 | #
50 | # 'jvm': true if the 'classname' class should be loaded
51 | # from jar files in the root directory of the plugin.
52 | # Note that only jar files in the root directory are
53 | # added to the classpath for the plugin! If you need
54 | # other resources, package them into a resources jar.
55 | jvm=${elasticsearch.plugin.jvm}
56 | #
57 | # 'classname': the name of the class to load, fully-qualified.
58 | classname=${elasticsearch.plugin.classname}
59 | #
60 | # 'java.version' version of java the code is built against
61 | # use the system property java.specification.version
62 | # version string must be a sequence of nonnegative decimal integers
63 | # separated by "."'s and may have leading zeros
64 | java.version=${maven.compiler.target}
65 | #
66 | # 'elasticsearch.version' version of elasticsearch compiled against
67 | # You will have to release a new version of the plugin for each new
68 | # elasticsearch release. This version is checked when the plugin
69 | # is loaded so Elasticsearch will refuse to start in the presence of
70 | # plugins with the incorrect elasticsearch.version.
71 | elasticsearch.version=${elasticsearch.version}
72 | #
73 | ### deprecated elements for jvm plugins :
74 | #
75 | # 'isolated': true if the plugin should have its own classloader.
76 | # passing false is deprecated, and only intended to support plugins
77 | # that have hard dependencies against each other. If this is
78 | # not specified, then the plugin is isolated by default.
79 | isolated=${elasticsearch.plugin.isolated}
80 | #
--------------------------------------------------------------------------------