├── README.md ├── lib └── hanlp-1.2.8.jar ├── pom.xml └── src └── main ├── assemblies └── plugin.xml ├── java ├── com │ └── hankcs │ │ └── lucene4 │ │ ├── AnalyzeContext.java │ │ ├── CharacterUtil.java │ │ ├── HanLPAnalyzer.java │ │ ├── HanLPIndexAnalyzer.java │ │ ├── HanLPTokenizer.java │ │ ├── HanlpSegmenter.java │ │ └── PorterStemmer.java └── org │ └── elasticsearch │ ├── index │ └── analysis │ │ ├── HanLPAnalyzerProvider.java │ │ └── HanLPTokenizerFactory.java │ └── plugin │ └── analysis │ └── hanlp │ └── AnalysisHanlpPlugin.java └── resources ├── hanlp.properties └── plugin-descriptor.properties /README.md: -------------------------------------------------------------------------------- 1 | # elasticsearch-analysis-hanlp 2 | 3 | 安装步骤: 4 | 5 | 1、下载插件并解压到es的plugins目录下 6 | 7 | 修改analysis-hanlp目录下的hanlp.properties文件,修改root的属性,值为analysis-hanlp下的data目录的地址 8 | 9 | 修改analysis-hanlp目录下的plugin-descriptor.properties文件, 10 | 修改elasticsearch的版本为你当前的版本elasticsearch.version=你的es版本号(例如:5.5.1) 11 | 12 | 2、修改es config目录下的jvm.options文件,最后一行添加 13 | 14 | -Djava.security.policy=/home/es/elasticsearch-7.0.1/plugins/analysis-hanlp/plugin-security.policy(地址换成你自己的地址) 15 | 16 | 重启es 17 | 18 | GET /_analyze?analyzer=hanlp-index&pretty=true 19 | 20 | { 21 | 22 | "text":"张柏芝士蛋糕店" 23 | 24 | } 25 | 26 | 27 | 测试是否安装成功 28 | 29 | 30 | analyzer有hanlp-index(索引模式,hanlp是它的别名)和hanlp-smart(智能模式) 31 | 32 | hanlp-index:尽可能的切分多的结果 33 | 34 | hanlp-smart:切分少的词 35 | 36 | 自定义词典: 37 | 38 | 修改plugins/analysis-hanlp/data/dictionary/custom下的 我的词典.txt文件 39 | 40 | 格式遵从[单词] [词性A] [A的频次] 41 | 42 | 修改完后删除同目录下的CustomDictionary.txt.bin文件 43 | 44 | 重启es服务 45 | 46 | 首次启动hanlp会生成相应的文件,首次启动可能会慢一点 47 | -------------------------------------------------------------------------------- /lib/hanlp-1.2.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kepmov/elasticsearch-analysis-hanlp/04f35320e2dec54c2c3a0c2e6758864797ea8cfc/lib/hanlp-1.2.8.jar 
-------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.elasticsearch 6 | elasticsearch-analysis-hanlp 7 | 5.1.1 8 | jar 9 | 10 | elasticsearch-analysis-hanlp 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 5.1.1 16 | 1.8 17 | ${project.basedir}/src/main/assemblies/plugin.xml 18 | analysis-hanlp 19 | org.elasticsearch.plugin.analysis.hanlp.AnalysisHanlpPlugin 20 | true 21 | false 22 | true 23 | 4E899B30 24 | true 25 | 26 | 27 | 28 | oss.sonatype.org 29 | OSS Sonatype 30 | 31 | true 32 | 33 | 34 | true 35 | 36 | http://oss.sonatype.org/content/repositories/releases/ 37 | 38 | 39 | 40 | 41 | org.elasticsearch 42 | elasticsearch 43 | ${elasticsearch.version} 44 | compile 45 | 46 | 47 | log4j 48 | log4j 49 | 1.2.16 50 | runtime 51 | 52 | 53 | com.HanLP 54 | hanlp 55 | 1.2.8 56 | ${pom.basedir}/lib/hanlp-1.2.8.jar 57 | system 58 | 59 | 60 | junit 61 | junit 62 | 4.11 63 | test 64 | 65 | 66 | org.junit.jupiter 67 | junit-jupiter-api 68 | RELEASE 69 | 70 | 71 | org.apache.lucene 72 | lucene-analyzers-common 73 | 6.4.0 74 | 75 | 76 | 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-compiler-plugin 81 | 2.3.2 82 | 83 | 1.8 84 | 1.8 85 | 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-surefire-plugin 90 | 2.11 91 | 92 | 93 | org.apache.maven.plugins 94 | maven-source-plugin 95 | 2.1.2 96 | 97 | 98 | attach-sources 99 | 100 | jar 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | analysis-hanlp-release 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/config 11 | /config 12 | 13 | 14 | 15 | 16 | 17 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 18 | 19 | true 20 | 21 | 22 | 
${project.basedir}/src/main/resources/hanlp.properties 23 | 24 | true 25 | 26 | 27 | 28 | 29 | / 30 | true 31 | true 32 | 33 | org.elasticsearch:elasticsearch 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene4/AnalyzeContext.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene4; 2 | 3 | import com.hankcs.hanlp.seg.common.Term; 4 | 5 | import java.io.IOException; 6 | import java.io.Reader; 7 | import java.util.*; 8 | 9 | /** 10 | * 分词器上下文状态 11 | */ 12 | class AnalyzeContext { 13 | 14 | //默认缓冲区大小 15 | public static final int BUFF_SIZE = 4096; 16 | //缓冲区耗尽的临界值 17 | private static final int BUFF_EXHAUST_CRITICAL = 100; 18 | 19 | 20 | //字符窜读取缓冲 21 | private char[] segmentBuff; 22 | //字符类型数组 23 | private int[] charTypes; 24 | 25 | //记录Reader内已分析的字串总长度 26 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 27 | private int buffOffset; 28 | //当前缓冲区位置指针 29 | private int cursor; 30 | //最近一次读入的,可处理的字串长度 31 | private int available; 32 | 33 | //最终分词结果集 34 | private LinkedList results; 35 | 36 | //子分词器锁 37 | //该集合非空,说明有子分词器在占用segmentBuff 38 | private Set buffLocker; 39 | 40 | public AnalyzeContext() { 41 | this.segmentBuff = new char[BUFF_SIZE]; 42 | this.charTypes = new int[BUFF_SIZE]; 43 | this.buffLocker = new HashSet(); 44 | this.results = new LinkedList(); 45 | } 46 | 47 | char[] getSegmentBuff() { 48 | return this.segmentBuff; 49 | } 50 | 51 | /** 52 | * 根据context的上下文情况,填充segmentBuff 53 | * 54 | * @param reader 55 | * @return 返回待分析的(有效的)字串长度 56 | * @throws IOException 57 | */ 58 | int fillBuffer(Reader reader) throws IOException { 59 | int readCount = 0; 60 | if (this.buffOffset == 0) { 61 | //首次读取reader 62 | readCount = reader.read(segmentBuff); 63 | } else { 64 | int offset = this.available - this.cursor; 65 | if (offset > 0) { 66 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 67 | 
System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset); 68 | readCount = offset; 69 | } 70 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 71 | readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset); 72 | } 73 | if (readCount < BUFF_SIZE && readCount > 0) { 74 | char[] lastSegmentBuff = new char[readCount]; 75 | System.arraycopy(segmentBuff, 0, lastSegmentBuff, 0, readCount); 76 | segmentBuff = lastSegmentBuff; 77 | } 78 | //记录最后一次从Reader中读入的可用字符长度 79 | this.available = readCount; 80 | //重置当前指针 81 | this.cursor = 0; 82 | return readCount; 83 | } 84 | 85 | /** 86 | * 只要buffLocker中存在segmenterName 87 | * 则buffer被锁定 88 | * 89 | * @return boolean 缓冲去是否被锁定 90 | */ 91 | boolean isBufferLocked() { 92 | return this.buffLocker.size() > 0; 93 | } 94 | 95 | /** 96 | * 当前segmentBuff是否已经用完 97 | * 当前执针cursor移至segmentBuff末端this.available 98 | * 99 | * @return 100 | */ 101 | boolean bufferConsumed() { 102 | return this.cursor == this.available; 103 | } 104 | 105 | /** 106 | * 判断segmentBuff是否需要读取新数据 107 | *

108 | * 满足一下条件时, 109 | * 1.available == BUFF_SIZE 表示buffer满载 110 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 111 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer 112 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) 113 | * 114 | * @return 115 | */ 116 | boolean needRefillBuffer() { 117 | return this.available == BUFF_SIZE 118 | && this.cursor < this.available - 1 119 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL 120 | && !this.isBufferLocked(); 121 | } 122 | 123 | /** 124 | * 累计当前的segmentBuff相对于reader起始位置的位移 125 | */ 126 | void markBufferOffset() { 127 | this.buffOffset += this.cursor; 128 | } 129 | 130 | /** 131 | * 重置分词上下文状态 132 | */ 133 | void reset() { 134 | this.buffLocker.clear(); 135 | this.available = 0; 136 | this.buffOffset = 0; 137 | this.charTypes = new int[BUFF_SIZE]; 138 | this.cursor = 0; 139 | this.segmentBuff = new char[BUFF_SIZE]; 140 | this.results.clear(); 141 | } 142 | 143 | /** 144 | * term 145 | *

/**
 * Character classification and normalisation helpers.
 */
class CharacterUtil {

    public static final int CHAR_USELESS = 0;

    public static final int CHAR_ARABIC = 0X00000001;

    public static final int CHAR_ENGLISH = 0X00000002;

    public static final int CHAR_CHINESE = 0X00000004;

    public static final int CHAR_OTHER_CJK = 0X00000008;

    /** Ideographic (full-width) space, U+3000. */
    private static final char FULLWIDTH_SPACE = 12288;
    /** Full-width forms U+FF01..U+FF5E map onto ASCII U+0021..U+007E by this offset. */
    private static final int FULLWIDTH_OFFSET = 65248;

    private CharacterUtil() {
        // static helpers only
    }

    /**
     * Classifies a character.
     *
     * @param input character to classify
     * @return one of the {@code CHAR_*} constants of this class; {@link #CHAR_USELESS}
     *         for anything that is not a digit, an ASCII letter, or in one of the
     *         recognised CJK Unicode blocks
     */
    static int identifyCharType(char input) {
        if (input >= '0' && input <= '9') {
            return CHAR_ARABIC;
        }
        if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
            return CHAR_ENGLISH;
        }

        Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);

        // Chinese ideographs.
        if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
            return CHAR_CHINESE;
        }

        if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // full-width digits/letters, half-width kana
                // Korean
                || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
                || ub == Character.UnicodeBlock.HANGUL_JAMO
                || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                // Japanese
                || ub == Character.UnicodeBlock.HIRAGANA
                || ub == Character.UnicodeBlock.KATAKANA
                || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
            return CHAR_OTHER_CJK;
        }

        // Everything else is ignored by the analyser.
        return CHAR_USELESS;
    }

    /**
     * Normalises a character: full-width forms are folded to their half-width ASCII
     * equivalent and, if requested, upper-case ASCII letters are lower-cased.
     *
     * <p>Both steps are applied in sequence, so a full-width upper-case letter such as
     * U+FF21 becomes {@code 'a'} when {@code lowercase} is {@code true}.
     *
     * @param input     character to normalise
     * @param lowercase whether ASCII upper-case letters are converted to lower case
     * @return the normalised character
     */
    static char regularize(char input, boolean lowercase) {
        char result = input;

        if (result == FULLWIDTH_SPACE) {
            result = ' ';
        } else if (result > 65280 && result < 65375) {
            result = (char) (result - FULLWIDTH_OFFSET);
        }

        // Case folding runs after width folding so that folded letters are covered too.
        if (lowercase && result >= 'A' && result <= 'Z') {
            result = (char) (result + 32);
        }

        return result;
    }
}
Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 51 | return CHAR_OTHER_CJK; 52 | 53 | } 54 | } 55 | //其他的不做处理的字符 56 | return CHAR_USELESS; 57 | } 58 | 59 | /** 60 | * 进行字符规格化(全角转半角,大写转小写处理) 61 | * @param input 62 | * @return char 63 | */ 64 | static char regularize(char input,boolean lowercase){ 65 | if (input == 12288) { 66 | input = (char) 32; 67 | 68 | }else if (input > 65280 && input < 65375) { 69 | input = (char) (input - 65248); 70 | 71 | }else if (input >= 'A' && input <= 'Z' && lowercase) { 72 | input += 32; 73 | } 74 | 75 | return input; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene4/HanLPAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene4; 2 | 3 | import com.hankcs.hanlp.tokenizer.StandardTokenizer; 4 | 5 | import org.apache.lucene.analysis.Analyzer; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | 8 | import java.io.Reader; 9 | import java.util.Set; 10 | 11 | public class HanLPAnalyzer extends Analyzer 12 | { 13 | 14 | boolean enablePorterStemming; 15 | public Set filter; 16 | 17 | /** 18 | * @param filter 停用词 19 | * @param enablePorterStemming 是否分析词干(仅限英文) 20 | */ 21 | public HanLPAnalyzer(Set filter, boolean enablePorterStemming) 22 | { 23 | this.filter = filter; 24 | } 25 | 26 | /** 27 | * @param enablePorterStemming 是否分析词干.进行单复数,时态的转换 28 | */ 29 | public HanLPAnalyzer(boolean enablePorterStemming) 30 | { 31 | this.enablePorterStemming = enablePorterStemming; 32 | } 33 | 34 | public HanLPAnalyzer() 35 | { 36 | super(); 37 | } 38 | 39 | @Override 40 | protected TokenStreamComponents createComponents(String fieldName) { 41 | Tokenizer tokenizer = new HanLPTokenizer(StandardTokenizer.SEGMENT.enableOffset(true), filter, enablePorterStemming); 42 | return new TokenStreamComponents(tokenizer); 43 | } 44 | } 45 | 
-------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene4/HanLPIndexAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene4; 2 | 3 | import com.hankcs.hanlp.HanLP; 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import java.util.Set; 7 | 8 | public class HanLPIndexAnalyzer extends Analyzer 9 | { 10 | 11 | boolean pstemming; 12 | public Set filter; 13 | 14 | /** 15 | * @param filter 停用词 16 | * @param pstemming 是否分析词干 17 | */ 18 | public HanLPIndexAnalyzer(Set filter, boolean pstemming) 19 | { 20 | this.filter = filter; 21 | } 22 | 23 | /** 24 | * @param pstemming 是否分析词干.进行单复数,时态的转换 25 | */ 26 | public HanLPIndexAnalyzer(boolean pstemming) 27 | { 28 | this.pstemming = pstemming; 29 | } 30 | 31 | public HanLPIndexAnalyzer() 32 | { 33 | super(); 34 | } 35 | 36 | @Override 37 | protected TokenStreamComponents createComponents(String fieldName) { 38 | Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enablePlaceRecognize(true).enableCustomDictionary(true). 
39 | enableIndexMode(true).enableOrganizationRecognize(true).enablePlaceRecognize(true), filter, pstemming); 40 | return new TokenStreamComponents(tokenizer); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene4/HanLPTokenizer.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene4; 2 | 3 | import com.hankcs.hanlp.HanLP; 4 | import com.hankcs.hanlp.seg.Segment; 5 | import com.hankcs.hanlp.seg.common.Term; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 11 | 12 | import java.io.BufferedReader; 13 | import java.io.File; 14 | import java.io.IOException; 15 | import java.util.List; 16 | import java.util.Set; 17 | 18 | public class HanLPTokenizer extends Tokenizer { 19 | //词元文本属性 20 | private final CharTermAttribute termAtt; 21 | //词元位移属性 22 | private final OffsetAttribute offsetAtt; 23 | //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 24 | private final TypeAttribute typeAtt; 25 | private PositionIncrementAttribute posIncrAtt; 26 | private Set filter; 27 | private boolean enablePorterStemming; 28 | private HanlpSegmenter hanlpSegmenter; 29 | private int position; 30 | 31 | public HanLPTokenizer(Segment segment, Set filter, boolean enablePorterStemming) { 32 | super(); 33 | this.filter = filter; 34 | this.enablePorterStemming = enablePorterStemming; 35 | offsetAtt = addAttribute(OffsetAttribute.class); 36 | termAtt = addAttribute(CharTermAttribute.class); 37 | typeAtt = addAttribute(TypeAttribute.class); 38 | posIncrAtt = addAttribute(PositionIncrementAttribute.class); 39 | hanlpSegmenter = new HanlpSegmenter(input, segment); 
40 | } 41 | 42 | @Override 43 | final public boolean incrementToken() throws IOException { 44 | clearAttributes(); 45 | this.position = 0; 46 | Term term = hanlpSegmenter.next(); 47 | if (term != null) { 48 | posIncrAtt.setPositionIncrement(this.position + 1); 49 | termAtt.setEmpty().append(term.word.toLowerCase()); 50 | termAtt.setLength(term.word.length()); 51 | int length = term.word.length(); 52 | offsetAtt.setOffset(term.offset, 53 | term.offset + length); 54 | typeAtt.setType(term.nature.name()); 55 | return true; 56 | } else { 57 | return false; 58 | } 59 | } 60 | 61 | /** 62 | * 必须重载的方法,否则在批量索引文件时将会导致文件索引失败 63 | */ 64 | @Override 65 | public void reset() throws IOException { 66 | super.reset(); 67 | this.position = 0; 68 | hanlpSegmenter.reset(new BufferedReader(this.input)); 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene4/HanlpSegmenter.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene4; 2 | 3 | import com.hankcs.hanlp.seg.Segment; 4 | import com.hankcs.hanlp.seg.common.Term; 5 | 6 | import java.io.IOException; 7 | import java.io.Reader; 8 | import java.util.List; 9 | 10 | /** 11 | * hanlp分词器主类 12 | */ 13 | public final class HanlpSegmenter { 14 | 15 | //字符窜reader 16 | private Reader input; 17 | //分词器上下文 18 | private AnalyzeContext context; 19 | private Segment segment; 20 | private int readNum; 21 | 22 | /** 23 | * 分词器构造函数 24 | * 25 | * @param input 26 | */ 27 | public HanlpSegmenter(Reader input, Segment segment) { 28 | 29 | this.input = input; 30 | this.segment = segment; 31 | this.init(); 32 | } 33 | 34 | 35 | /** 36 | * 初始化 37 | */ 38 | private void init() { 39 | //初始化分词上下文 40 | this.context = new AnalyzeContext(); 41 | } 42 | 43 | /** 44 | * 分词,获取下一个词元 45 | * 46 | * @return Lexeme 词元对象 47 | * @throws IOException 48 | */ 49 | public synchronized Term next() { 50 | try { 51 | Term term = null; 52 
| while ((term = context.getNextTerm()) == null) { 53 | /* 54 | * 从reader中读取数据,填充buffer 55 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 56 | */ 57 | int available = context.fillBuffer(this.input); 58 | if (available <= 0) { 59 | //reader已经读完 60 | context.reset(); 61 | return null; 62 | 63 | } else { 64 | List lists = segment.seg(String.valueOf(context.getSegmentBuff())); 65 | for (Term t : lists) { 66 | context.addToResults(t); 67 | } 68 | readNum++; 69 | //字符缓冲区接近读完,需要读入新的字符 70 | if (context.needRefillBuffer()) { 71 | break; 72 | } 73 | } 74 | //移动指针至available 75 | context.bufferConsumed(); 76 | //记录本次分词的缓冲区位移 77 | context.markBufferOffset(); 78 | } 79 | if (term != null) { 80 | term.offset = term.offset + (AnalyzeContext.BUFF_SIZE * (readNum - 1)); 81 | } 82 | return term; 83 | } catch (Exception e) { 84 | e.printStackTrace(); 85 | } 86 | return null; 87 | } 88 | 89 | /** 90 | * 重置分词器到初始状态 91 | * 92 | * @param input 93 | */ 94 | public synchronized void reset(Reader input) { 95 | this.input = input; 96 | this.readNum = 0; 97 | context.reset(); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene4/PorterStemmer.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene4; 2 | 3 | import org.apache.lucene.util.ArrayUtil; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | 9 | import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; 10 | 11 | /** 12 | * 抄袭lucene的英文处理 13 | * Stemmer, implementing the Porter Stemming Algorithm 14 | *

/**
 * English stemmer adapted from Lucene, implementing the Porter Stemming Algorithm.
 *
 * <p>The Stemmer class transforms a word into its root form. The input word can be
 * provided a character at a time (by calling add()), or at once by calling one of
 * the various stem(something) methods.
 *
 * <p>Only lower-case ASCII letters are treated as vowels; the class keeps per-word
 * state in fields, so one instance must not be shared between threads.
 */
public class PorterStemmer {
    private static final int INITIAL_SIZE = 50;

    /** Working buffer holding the word being stemmed. */
    private char[] b;
    private int i, /* number of valid chars in b */
            j, k, k0;
    private boolean dirty = false;

    public PorterStemmer() {
        b = new char[INITIAL_SIZE];
        i = 0;
    }

    /**
     * reset() resets the stemmer so it can stem another word. If you invoke the
     * stemmer by calling add(char) and then stem(), you must call reset()
     * before starting another word.
     */
    public void reset() {
        i = 0;
        dirty = false;
    }

    /**
     * Add a character to the word being stemmed. When you are finished adding
     * characters, you can call stem(void) to process the word.
     */
    public void add(char ch) {
        if (b.length <= i) {
            // Grow geometrically using plain JDK calls.
            char[] grown = new char[Math.max(i + 1, b.length * 2)];
            System.arraycopy(b, 0, grown, 0, b.length);
            b = grown;
        }
        b[i++] = ch;
    }

    /**
     * After a word has been stemmed, it can be retrieved by toString(), or a
     * reference to the internal buffer can be retrieved by getResultBuffer and
     * getResultLength (which is generally more efficient.)
     */
    @Override
    public String toString() {
        return new String(b, 0, i);
    }

    /**
     * Returns the length of the word resulting from the stemming process.
     */
    public int getResultLength() {
        return i;
    }

    /**
     * Returns a reference to a character buffer containing the results of the
     * stemming process. You also need to consult getResultLength() to determine
     * the length of the result.
     */
    public char[] getResultBuffer() {
        return b;
    }

    /* cons(i) is true <=> b[i] is a consonant. */
    private boolean cons(int i) {
        switch (b[i]) {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
                return false;
            case 'y':
                return (i == k0) ? true : !cons(i - 1);
            default:
                return true;
        }
    }

    /*
     * m() measures the number of consonant sequences between k0 and j. if c is
     * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
     * presence,
     *
     * <c><v> gives 0, <c>vc<v> gives 1, <c>vcvc<v> gives 2, <c>vcvcvc<v> gives 3
     * ....
     */
    private int m() {
        int n = 0;
        int i = k0;
        // Skip the optional leading consonant run.
        while (true) {
            if (i > j) {
                return n;
            }
            if (!cons(i)) {
                break;
            }
            i++;
        }
        i++;
        while (true) {
            // Vowel run.
            while (true) {
                if (i > j) {
                    return n;
                }
                if (cons(i)) {
                    break;
                }
                i++;
            }
            i++;
            n++;
            // Consonant run.
            while (true) {
                if (i > j) {
                    return n;
                }
                if (!cons(i)) {
                    break;
                }
                i++;
            }
            i++;
        }
    }

    /* vowelinstem() is true <=> k0,...j contains a vowel */
    private boolean vowelinstem() {
        for (int i = k0; i <= j; i++) {
            if (!cons(i)) {
                return true;
            }
        }
        return false;
    }

    /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
    private boolean doublec(int j) {
        if (j < k0 + 1) {
            return false;
        }
        if (b[j] != b[j - 1]) {
            return false;
        }
        return cons(j);
    }

    /*
     * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
     * and also if the second c is not w,x or y. this is used when trying to
     * restore an e at the end of a short word. e.g.
     *
     * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
     */
    private boolean cvc(int i) {
        if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) {
            return false;
        }
        int ch = b[i];
        return !(ch == 'w' || ch == 'x' || ch == 'y');
    }

    /* ends(s) is true <=> k0,...k ends with the string s; on success j marks the stem end. */
    private boolean ends(String s) {
        int l = s.length();
        int o = k - l + 1;
        if (o < k0) {
            return false;
        }
        for (int i = 0; i < l; i++) {
            if (b[o + i] != s.charAt(i)) {
                return false;
            }
        }
        j = k - l;
        return true;
    }

    /*
     * setto(s) sets (j+1),...k to the characters in the string s, readjusting
     * k.
     */
    void setto(String s) {
        int l = s.length();
        int o = j + 1;
        for (int i = 0; i < l; i++) {
            b[o + i] = s.charAt(i);
        }
        k = j + l;
        dirty = true;
    }

    /* r(s) is used further down. */
    void r(String s) {
        if (m() > 0) {
            setto(s);
        }
    }

    /*
     * step1() gets rid of plurals and -ed or -ing. e.g.
     *
     * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
     *
     * feed -> feed agreed -> agree disabled -> disable
     *
     * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
     * mess
     *
     * meetings -> meet
     */
    private void step1() {
        if (b[k] == 's') {
            if (ends("sses")) {
                k -= 2;
            } else if (ends("ies")) {
                setto("i");
            } else if (b[k - 1] != 's') {
                k--;
            }
        }
        if (ends("eed")) {
            if (m() > 0) {
                k--;
            }
        } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
            k = j;
            if (ends("at")) {
                setto("ate");
            } else if (ends("bl")) {
                setto("ble");
            } else if (ends("iz")) {
                setto("ize");
            } else if (doublec(k)) {
                int ch = b[k--];
                if (ch == 'l' || ch == 's' || ch == 'z') {
                    k++;
                }
            } else if (m() == 1 && cvc(k)) {
                setto("e");
            }
        }
    }

    /* step2() turns terminal y to i when there is another vowel in the stem. */
    private void step2() {
        if (ends("y") && vowelinstem()) {
            b[k] = 'i';
            dirty = true;
        }
    }

    /*
     * step3() maps double suffices to single ones. so -ization ( = -ize plus
     * -ation) maps to -ize etc. note that the string before the suffix must
     * give m() > 0.
     */
    private void step3() {
        if (k == k0) {
            return; /* For Bug 1 */
        }
        switch (b[k - 1]) {
            case 'a':
                if (ends("ational")) {
                    r("ate");
                    break;
                }
                if (ends("tional")) {
                    r("tion");
                    break;
                }
                break;
            case 'c':
                if (ends("enci")) {
                    r("ence");
                    break;
                }
                if (ends("anci")) {
                    r("ance");
                    break;
                }
                break;
            case 'e':
                if (ends("izer")) {
                    r("ize");
                    break;
                }
                break;
            case 'l':
                if (ends("bli")) {
                    r("ble");
                    break;
                }
                if (ends("alli")) {
                    r("al");
                    break;
                }
                if (ends("entli")) {
                    r("ent");
                    break;
                }
                if (ends("eli")) {
                    r("e");
                    break;
                }
                if (ends("ousli")) {
                    r("ous");
                    break;
                }
                break;
            case 'o':
                if (ends("ization")) {
                    r("ize");
                    break;
                }
                if (ends("ation")) {
                    r("ate");
                    break;
                }
                if (ends("ator")) {
                    r("ate");
                    break;
                }
                break;
            case 's':
                if (ends("alism")) {
                    r("al");
                    break;
                }
                if (ends("iveness")) {
                    r("ive");
                    break;
                }
                if (ends("fulness")) {
                    r("ful");
                    break;
                }
                if (ends("ousness")) {
                    r("ous");
                    break;
                }
                break;
            case 't':
                if (ends("aliti")) {
                    r("al");
                    break;
                }
                if (ends("iviti")) {
                    r("ive");
                    break;
                }
                if (ends("biliti")) {
                    r("ble");
                    break;
                }
                break;
            case 'g':
                if (ends("logi")) {
                    r("log");
                    break;
                }
                break;
            default:
                break;
        }
    }

    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
    private void step4() {
        switch (b[k]) {
            case 'e':
                if (ends("icate")) {
                    r("ic");
                    break;
                }
                if (ends("ative")) {
                    r("");
                    break;
                }
                if (ends("alize")) {
                    r("al");
                    break;
                }
                break;
            case 'i':
                if (ends("iciti")) {
                    r("ic");
                    break;
                }
                break;
            case 'l':
                if (ends("ical")) {
                    r("ic");
                    break;
                }
                if (ends("ful")) {
                    r("");
                    break;
                }
                break;
            case 's':
                if (ends("ness")) {
                    r("");
                    break;
                }
                break;
            default:
                break;
        }
    }

    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
    private void step5() {
        if (k == k0) {
            return; /* for Bug 1 */
        }
        switch (b[k - 1]) {
            case 'a':
                if (ends("al")) {
                    break;
                }
                return;
            case 'c':
                if (ends("ance")) {
                    break;
                }
                if (ends("ence")) {
                    break;
                }
                return;
            case 'e':
                if (ends("er")) {
                    break;
                }
                return;
            case 'i':
                if (ends("ic")) {
                    break;
                }
                return;
            case 'l':
                if (ends("able")) {
                    break;
                }
                if (ends("ible")) {
                    break;
                }
                return;
            case 'n':
                if (ends("ant")) {
                    break;
                }
                if (ends("ement")) {
                    break;
                }
                if (ends("ment")) {
                    break;
                }
                /* element etc. not stripped before the m */
                if (ends("ent")) {
                    break;
                }
                return;
            case 'o':
                /* j >= 0 fixes Bug 2 */
                if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) {
                    break;
                }
                /* takes care of -ous */
                if (ends("ou")) {
                    break;
                }
                return;
            case 's':
                if (ends("ism")) {
                    break;
                }
                return;
            case 't':
                if (ends("ate")) {
                    break;
                }
                if (ends("iti")) {
                    break;
                }
                return;
            case 'u':
                if (ends("ous")) {
                    break;
                }
                return;
            case 'v':
                if (ends("ive")) {
                    break;
                }
                return;
            case 'z':
                if (ends("ize")) {
                    break;
                }
                return;
            default:
                return;
        }
        if (m() > 1) {
            k = j;
        }
    }

    /* step6() removes a final -e if m() > 1. */
    private void step6() {
        j = k;
        if (b[k] == 'e') {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1)) {
                k--;
            }
        }
        if (b[k] == 'l' && doublec(k) && m() > 1) {
            k--;
        }
    }

    /**
     * Stem a word contained in a portion of a char[] array. Returns true if the
     * stemming process resulted in a word different from the input. You can
     * retrieve the result with getResultLength()/getResultBuffer() or
     * toString().
     */
    public boolean stem(char[] wordBuffer, int offset, int wordLen) {
        reset();
        if (b.length < wordLen) {
            // Leave some headroom so that slightly longer words do not reallocate again.
            b = new char[Math.max(wordLen, b.length * 2)];
        }
        System.arraycopy(wordBuffer, offset, b, 0, wordLen);
        i = wordLen;
        return stem(0);
    }

    /**
     * Stem the word placed into the Stemmer buffer through calls to add().
     * Returns true if the stemming process resulted in a word different from
     * the input. You can retrieve the result with
     * getResultLength()/getResultBuffer() or toString().
     */
    public boolean stem() {
        return stem(0);
    }

    /**
     * Stems the buffered word, treating index {@code i0} as the first character.
     * Words of fewer than three characters are left untouched.
     *
     * @return true if the word was changed
     */
    public boolean stem(int i0) {
        k = i - 1;
        k0 = i0;
        if (k > k0 + 1) {
            step1();
            step2();
            step3();
            step4();
            step5();
            step6();
        }
        // Also, a word is considered dirty if we lopped off letters
        // Thanks to Ifigenia Vairelles for pointing this out.
        if (i != k + 1) {
            dirty = true;
        }
        i = k + 1;
        return dirty;
    }

}
596 | */ 597 | public boolean stem() 598 | { 599 | return stem(0); 600 | } 601 | 602 | public boolean stem(int i0) 603 | { 604 | k = i - 1; 605 | k0 = i0; 606 | if (k > k0 + 1) 607 | { 608 | step1(); 609 | step2(); 610 | step3(); 611 | step4(); 612 | step5(); 613 | step6(); 614 | } 615 | // Also, a word is considered dirty if we lopped off letters 616 | // Thanks to Ifigenia Vairelles for pointing this out. 617 | if (i != k + 1) 618 | dirty = true; 619 | i = k + 1; 620 | return dirty; 621 | } 622 | 623 | } 624 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/HanLPAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import com.hankcs.hanlp.utility.Predefine; 4 | import com.hankcs.lucene4.HanLPIndexAnalyzer; 5 | import org.elasticsearch.common.inject.Inject; 6 | import org.elasticsearch.common.inject.assistedinject.Assisted; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | import java.io.File; 11 | import java.nio.file.Path; 12 | 13 | /** 14 | */ 15 | public class HanLPAnalyzerProvider extends AbstractIndexAnalyzerProvider { 16 | 17 | private final HanLPIndexAnalyzer analyzer; 18 | 19 | @Inject 20 | public HanLPAnalyzerProvider(IndexSettings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) { 21 | super(indexSettings, name, settings); 22 | Path pluginsDir = env.pluginsFile(); 23 | Predefine.HANLP_PROPERTIES_PATH = pluginsDir.toString() + File.separator + "analysis-hanlp" + File.separator + "hanlp.properties"; 24 | analyzer = new HanLPIndexAnalyzer(true); 25 | } 26 | 27 | public static HanLPAnalyzerProvider getIndexAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 28 | return new 
HanLPAnalyzerProvider(indexSettings, env, name, settings); 29 | } 30 | 31 | public static HanLPAnalyzerProvider getSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) { 32 | return new HanLPAnalyzerProvider(indexSettings, env, name, settings); 33 | } 34 | 35 | @Override 36 | public HanLPIndexAnalyzer get() { 37 | return this.analyzer; 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/HanLPTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import com.hankcs.hanlp.utility.Predefine; 4 | import com.hankcs.lucene4.HanLPIndexAnalyzer; 5 | import org.elasticsearch.common.inject.Inject; 6 | import org.elasticsearch.common.inject.assistedinject.Assisted; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | import java.io.File; 11 | import java.nio.file.Path; 12 | 13 | /** 14 | */ 15 | public class HanLPTokenizerFactory extends AbstractTokenizerFactory { 16 | 17 | private boolean enablePorterStemming; 18 | private boolean enableIndexMode; 19 | 20 | public HanLPTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 21 | super(indexSettings, name, settings); 22 | Path pluginsDir = env.pluginsFile(); 23 | Predefine.HANLP_PROPERTIES_PATH = pluginsDir.toString() + File.separator + "analysis-hanlp" + File.separator + "hanlp.properties"; 24 | enablePorterStemming = settings.getAsBoolean("enablePorterStemming", false); 25 | } 26 | 27 | public static HanLPTokenizerFactory getIndexTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 28 | return new HanLPTokenizerFactory(indexSettings, env, name, settings).setIndexMode(true); 29 | } 30 | 31 | 
public static HanLPTokenizerFactory getSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { 32 | return new HanLPTokenizerFactory(indexSettings, env, name, settings).setIndexMode(false); 33 | } 34 | 35 | private HanLPTokenizerFactory setIndexMode(boolean enableIndexMode) { 36 | this.enableIndexMode = enableIndexMode; 37 | return this; 38 | } 39 | 40 | @Override 41 | public Tokenizer create() { 42 | return new HanLPTokenizer(HanLP.newSegment().enablePlaceRecognize(true).enableCustomDictionary(true).enableIndexMode(enableIndexMode).enableOffset(true), null, enablePorterStemming); 43 | } 44 | 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/plugin/analysis/hanlp/AnalysisHanlpPlugin.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.plugin.analysis.hanlp; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.index.analysis.*; 5 | import org.elasticsearch.indices.analysis.AnalysisModule; 6 | import org.elasticsearch.plugins.AnalysisPlugin; 7 | import org.elasticsearch.plugins.Plugin; 8 | 9 | import java.util.HashMap; 10 | import java.util.Map; 11 | 12 | /** 13 | * The HanLP Analysis HanLP module into elasticsearch. 
14 | */ 15 | public class AnalysisHanlpPlugin extends Plugin implements AnalysisPlugin { 16 | 17 | public static String PLUGIN_NAME = "analysis-hanlp"; 18 | 19 | @Override 20 | public Map> getTokenizers() { 21 | Map> extra = new HashMap<>(); 22 | 23 | extra.put("hanlp-index", HanLPTokenizerFactory::getIndexTokenizerFactory); 24 | extra.put("hanlp-smart", HanLPTokenizerFactory::getSmartTokenizerFactory); 25 | extra.put("hanlp", HanLPTokenizerFactory::getIndexTokenizerFactory); 26 | return extra; 27 | } 28 | 29 | @Override 30 | public Map>> getAnalyzers() { 31 | Map>> extra = new HashMap<>(); 32 | 33 | extra.put("hanlp-index", HanLPAnalyzerProvider::getIndexAnalyzerProvider); 34 | extra.put("hanlp-smart", HanLPAnalyzerProvider::getSmartAnalyzerProvider); 35 | extra.put("hanlp", HanLPAnalyzerProvider::getIndexAnalyzerProvider); 36 | return extra; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/resources/hanlp.properties: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kepmov/elasticsearch-analysis-hanlp/04f35320e2dec54c2c3a0c2e6758864797ea8cfc/src/main/resources/hanlp.properties -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: -------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 
6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=5.0 29 | # java.version=1.8 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${elasticsearch.plugin.name} 41 | 42 | ### mandatory elements for site plugins: 43 | # 44 | # 'site': set to true to indicate contents of the _site/ 45 | # directory in the root of the plugin should be served. 46 | site=${elasticsearch.plugin.site} 47 | # 48 | ### mandatory elements for jvm plugins : 49 | # 50 | # 'jvm': true if the 'classname' class should be loaded 51 | # from jar files in the root directory of the plugin. 52 | # Note that only jar files in the root directory are 53 | # added to the classpath for the plugin! If you need 54 | # other resources, package them into a resources jar. 55 | jvm=${elasticsearch.plugin.jvm} 56 | # 57 | # 'classname': the name of the class to load, fully-qualified. 
58 | classname=${elasticsearch.plugin.classname} 59 | # 60 | # 'java.version' version of java the code is built against 61 | # use the system property java.specification.version 62 | # version string must be a sequence of nonnegative decimal integers 63 | # separated by "."'s and may have leading zeros 64 | java.version=${maven.compiler.target} 65 | # 66 | # 'elasticsearch.version' version of elasticsearch compiled against 67 | # You will have to release a new version of the plugin for each new 68 | # elasticsearch release. This version is checked when the plugin 69 | # is loaded so Elasticsearch will refuse to start in the presence of 70 | # plugins with the incorrect elasticsearch.version. 71 | elasticsearch.version=${elasticsearch.version} 72 | # 73 | ### deprecated elements for jvm plugins : 74 | # 75 | # 'isolated': true if the plugin should have its own classloader. 76 | # passing false is deprecated, and only intended to support plugins 77 | # that have hard dependencies against each other. If this is 78 | # not specified, then the plugin is isolated by default. 79 | isolated=${elasticsearch.plugin.isolated} 80 | # --------------------------------------------------------------------------------