├── .gitignore
├── init.sql
├── pom.xml
├── readme.md
└── src
├── main
├── java
│ └── com
│ │ ├── hankcs
│ │ └── lucene
│ │ │ ├── HanLPAnalyzer.java
│ │ │ ├── HanLPIndexAnalyzer.java
│ │ │ ├── HanLPTokenizer.java
│ │ │ ├── HanLPTokenizerFactory.java
│ │ │ ├── PorterStemmer.java
│ │ │ └── SegmentWrapper.java
│ │ └── watt
│ │ ├── CloudApplication.java
│ │ ├── configure
│ │ ├── DbConfig.java
│ │ ├── LuceneConfig.java
│ │ └── MybatisConfig.java
│ │ ├── core
│ │ ├── QuestionsIndex.java
│ │ ├── dictionary
│ │ │ ├── CoreAbbreviationDictionary.java
│ │ │ ├── CoreStopWordsDictionary.java
│ │ │ └── MyCustomDictionary.java
│ │ └── nlp
│ │ │ └── cosinesimlarity
│ │ │ ├── AtomSegment.java
│ │ │ ├── IDExtract.java
│ │ │ ├── SimilarityAnalyze.java
│ │ │ ├── SimilarityAnalyzeUnfamiliarWords.java
│ │ │ └── Word2Vec.java
│ │ ├── data
│ │ └── jdbc
│ │ │ ├── MySqlDataSource.java
│ │ │ ├── MySqlSessionFactoryBean.java
│ │ │ └── MySqlSessionTemplate.java
│ │ ├── mvc
│ │ ├── beans
│ │ │ ├── CheckResult.java
│ │ │ ├── PlatformResponse.java
│ │ │ └── QAAnalyzeResult.java
│ │ ├── controller
│ │ │ ├── CorpusController.java
│ │ │ └── QAController.java
│ │ ├── dao
│ │ │ ├── QADao.java
│ │ │ └── QADao.xml
│ │ └── service
│ │ │ └── QAService.java
│ │ └── util
│ │ ├── CommonUtils.java
│ │ ├── FileUtils.java
│ │ └── NLPUtils.java
├── resources
│ ├── application.yml
│ ├── hanlp.properties
│ └── mybatis.xml
└── webapp
│ ├── WEB-INF
│ ├── lib
│ │ └── hanlp-1.7.2.jar
│ └── web.xml
│ └── index.jsp
└── test
└── java
├── MapCount.java
└── wikiCorpus.java
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Maven Auto Generate ###
2 | target/
3 | !.mvn/wrapper/maven-wrapper.jar
4 |
5 | ### STS ###
6 | .apt_generated
7 | .classpath
8 | .factorypath
9 | .project
10 | .settings
11 | .springBeans
12 |
13 | ### IntelliJ IDEA ###
14 | *.idea
15 | *.iws
16 | *.iml
17 | *.ipr
18 |
19 | ### NetBeans ###
20 | nbproject/private/
21 | build/
22 | nbbuild/
23 | dist/
24 | nbdist/
25 | .nb-gradle/
26 |
27 | ### Log Files in Linux And macOS Environment ###
28 | /C:/
29 |
--------------------------------------------------------------------------------
/init.sql:
--------------------------------------------------------------------------------
1 | create table knowlede_dictionary_custom
2 | (
3 | uuid int auto_increment
4 | primary key,
5 | word varchar(40) null
6 | )
7 | charset = gb2312;
8 |
9 | create table knowledge_category
10 | (
11 | id varchar(64) not null comment '主键'
12 | primary key,
13 | create_by varchar(64) null comment '创建者',
14 | create_date datetime null comment '创建时间',
15 | update_by varchar(64) null comment '更新者',
16 | update_date datetime null comment '更新时间',
17 | remarks varchar(255) null comment '备注信息',
18 | del_flag varchar(64) null comment '逻辑删除标记(0:显示;1:隐藏)',
19 | category_name varchar(64) null comment '类别名'
20 | )
21 | comment '知识库类别表';
22 |
23 | create table knowledge_dictionary_abbreviation
24 | (
25 | id varchar(64) not null comment '主键'
26 | primary key,
27 | create_by varchar(64) null comment '创建者',
28 | create_date datetime null comment '创建时间',
29 | update_by varchar(64) null comment '更新者',
30 | update_date datetime null comment '更新时间',
31 | remarks varchar(255) null comment '备注信息',
32 | del_flag varchar(64) null comment '逻辑删除标记(0:显示;1:隐藏)',
33 | abbr_name varchar(64) null comment '简称',
34 | full_name varchar(64) null comment '全称'
35 | )
36 | comment '全简称管理';
37 |
38 | create table knowledge_dictionary_industry
39 | (
40 | id varchar(32) not null comment '主键'
41 | primary key,
42 | industry_name varchar(50) not null comment '行业名称',
43 | create_time datetime default CURRENT_TIMESTAMP not null comment '创建时间',
44 | is_valid char default 'Y' not null comment '有效标志:Y有效,N无效'
45 | )
46 | comment '行业字典表' charset = gb2312;
47 |
48 | create table knowledge_dictionary_stopwords
49 | (
50 | id int(10) auto_increment
51 | primary key,
52 | word varchar(255) null
53 | )
54 | collate = utf8_bin;
55 |
56 | create table knowledge_dictionary_synonym
57 | (
58 | id varchar(64) charset gbk not null comment '主键'
59 | primary key,
60 | create_by varchar(64) null comment '创建者',
61 | create_date datetime null comment '创建时间',
62 | update_by varchar(64) null comment '更新者',
63 | update_date datetime null comment '更新时间',
64 | remarks varchar(255) null comment '备注信息',
65 | del_flag varchar(64) not null comment '逻辑删除标记(0:显示;1:隐藏)',
66 | synonym varchar(1000) not null comment '同义词',
67 | type varchar(1) not null comment '类型近似和相等'
68 | )
69 | comment '同义词词典';
70 |
71 | create table knowledge_qa_answer
72 | (
73 | id varchar(64) not null comment '主键'
74 | primary key,
75 | create_by varchar(64) null comment '创建者',
76 | create_date datetime null comment '创建时间',
77 | update_by varchar(64) null comment '更新者',
78 | update_date datetime null comment '更新时间',
79 | remarks varchar(255) null comment '备注信息',
80 | del_flag varchar(64) null comment '逻辑删除标记(0:显示;1:隐藏)',
81 | answer longtext null comment '答案',
82 | reference_id varchar(64) null comment '媒体类型引用',
83 | media_type varchar(4) null comment '媒体类型',
84 | category_id varchar(64) null comment '类别id'
85 | )
86 | comment '问答答案表';
87 |
88 | create table knowledge_qa_logs
89 | (
90 | id varchar(64) not null comment '主键'
91 | primary key,
92 | create_by varchar(64) null comment '创建者',
93 | create_date datetime default CURRENT_TIMESTAMP null comment '创建时间',
94 | update_by varchar(64) null comment '更新者',
95 | update_date datetime null comment '更新时间',
96 | remarks varchar(255) null comment '备注信息',
97 | del_flag varchar(64) default '0' null comment '逻辑删除标记(0:显示;1:隐藏)',
98 | question varchar(128) null comment '问题',
99 | score varchar(64) null comment '评分',
100 | channel_id varchar(64) null comment '授权ID,接入渠道(微信,机器人等)',
101 | question_id varchar(64) null comment '问题ID'
102 | )
103 | comment '日志表';
104 |
105 | create table knowledge_qa_media
106 | (
107 | MEDIA_ID int auto_increment comment '素材ID'
108 | primary key,
109 | MEDIA_NAME varchar(100) not null comment '素材名称',
110 | MEDIA_SUMMARY varchar(2048) null comment '素材摘要',
111 | MEDIA_TYPE varchar(3) null comment '素材类型(GT:图文 IMG:图片 AU:语音 VI:视频)',
112 | MEDIA_URL varchar(512) null comment '素材链接(若本字段非空,素材文件中存储为素材的封面图片)',
113 | CREATE_TIME datetime not null comment '创建时间',
114 | UPDATE_TIME datetime null comment '更新时间'
115 | )
116 | comment '素材表';
117 |
118 | create table knowledge_qa_question
119 | (
120 | id varchar(64) not null comment '主键'
121 | primary key,
122 | create_by varchar(64) null comment '创建者',
123 | create_date datetime null comment '创建时间',
124 | update_by varchar(64) null comment '更新者',
125 | update_date datetime null comment '更新时间',
126 | remarks varchar(255) null comment '备注信息',
127 | del_flag varchar(64) default '0' null comment '逻辑删除标记(0:显示;1:隐藏)',
128 | question varchar(64) null comment '问题',
129 | answer_id varchar(64) null comment '答案id'
130 | )
131 | comment '知识库问答答案表';
132 |
133 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | 4.0.0
6 |
7 | com.inspur.tax
8 | knowledge-core
9 | 1.0-SNAPSHOT
10 | war
11 |
12 | knowledge-core Maven Webapp
13 |
14 | UTF-8
15 | 1.8
16 | 1.8
17 | 7.4.0
18 | 1.2.35
19 | 8.0.11
20 | 5.0.8.RELEASE
21 | 0.9.5.3
22 | 1.3.2
23 | 3.4.4
24 | 7.4.0
25 | 1.7.2
26 |
27 |
28 | org.springframework.boot
29 | spring-boot-starter-parent
30 | 2.0.4.RELEASE
31 |
32 |
33 |
34 |
35 | junit
36 | junit
37 | 4.11
38 | test
39 |
40 |
41 |
42 | org.jsoup
43 | jsoup
44 | 1.11.3
45 | test
46 |
47 |
48 | com.alibaba
49 | fastjson
50 | ${fastjson.version}
51 |
52 |
53 |
54 | org.apache.lucene
55 | lucene-queryparser
56 | ${lucene.version}
57 |
58 |
59 | org.apache.lucene
60 | lucene-highlighter
61 | ${lucene.version}
62 | test
63 |
64 |
65 | org.apache.lucene
66 | lucene-analyzers-common
67 | ${lucene.version}
68 |
69 |
70 |
71 | org.springframework.boot
72 | spring-boot-starter
73 |
74 |
75 | org.springframework.boot
76 | spring-boot-starter-web
77 |
78 |
79 | org.springframework.boot
80 | spring-boot-configuration-processor
81 | true
82 |
83 |
84 |
85 | org.mybatis
86 | mybatis
87 | ${mybatis.version}
88 |
89 |
90 | org.mybatis
91 | mybatis-spring
92 | ${mybatis-spring.version}
93 |
94 |
95 | com.mchange
96 | c3p0
97 | ${c3p0.version}
98 |
99 |
100 |
101 | org.springframework
102 | spring-jdbc
103 | ${spring.version}
104 |
105 |
106 |
107 | mysql
108 | mysql-connector-java
109 | ${mysql.connector}
110 |
111 |
112 |
113 | com.hankcs
114 | hanlp
115 | ${hanlp.version}
116 | system
117 | ${project.basedir}/src/main/webapp/WEB-INF/lib/hanlp-${hanlp.version}.jar
118 |
119 |
120 |
121 |
122 | knowledge
123 |
124 |
125 | src/main/java
126 |
127 | **/*.properties
128 | **/*.xml
129 |
130 | false
131 |
132 |
133 | src/main/resources
134 |
135 | **/
136 |
137 | false
138 |
139 |
140 | src/main/webapp
141 |
142 | **/*.properties
143 | **/*.xml
144 |
145 | false
146 |
147 |
148 |
149 | src/main/webapp/WEB-INF
150 | BOOT-INF/lib/
151 |
152 | **/*.jar
153 |
154 |
155 |
156 |
157 |
158 | org.springframework.boot
159 | spring-boot-maven-plugin
160 |
161 |
162 |
163 | org.apache.maven.plugins
164 | maven-surefire-plugin
165 |
166 | true
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # 咨询知识库系统概要
2 | 感谢HanLp、Lucene等开源系统给我们开发系统带来的便捷,也诚挚的邀请各位大神参与完善这个项目来供更多的人学习使用
3 | ##安装部署
4 | ###语料数据准备:
5 | 百度网盘下载地址:https://pan.baidu.com/s/1Syhk2Ehv_5Of19bHlFNSig
6 | 提取码:04ay
7 | ###数据库初始化
8 | 数据库采用mysql进行静态的数据存储,请在网盘中一并下载
9 | ```
10 | init.sql
11 | ```
12 | ###项目构建工具
13 | 项目采用maven的项目管理工具管理,并且采用的SpringBoot微服务框架开发。此处建议使用idea、eclipse等工具进行编辑开发。
14 | ###配置说明
15 | ####HanLp配置说明(hanlp.properties):
16 | 项目中只需要配置这个root就可以了,root参数是HanLp分词数据包的物理路径
17 | ``` properties
18 | root=/root/
19 | ```
20 | ####Lucene、词向量项目配置说明
21 | application.yml
22 | ``` yaml
23 | lucene:
24 | root: /root/lucene/ #Lucene索引位置的根目录
25 | indexKey: questionWithSynonyms #这个是Lucene查询、建立索引的时候共享的一个key,这个key可以一直不改变
26 | vectorPath: /root/data/wiki_chinese_word2vec.bin #词向量物理路径
27 | ```
28 | ####mysql数据库配置
29 | ```yaml
30 | db:
31 | mysql:
32 | driverClass: com.mysql.cj.jdbc.Driver #这个是mysql驱动配置不需要改动
33 | jdbcUrl: jdbc:mysql://IP:3306/tax_knowledge?useUnicode=true&characterEncoding=gb2312 #mysql地址端口号配置
34 | user: username #mysql用户名和密码配置
35 | password: password #mysql用户名和密码配置
36 | ```
37 | ###项目编译
38 | 执行maven命令,如果不懂maven的童鞋,请恶补一下基础知识,哈哈
39 | ```
40 | mvn clean install
41 | ```
42 | ###启动应用
43 | springboot项目启动只需要启动编译好的编译包就可以了,不懂springboot的童鞋要使劲学习啦。
44 | ```
45 | java -jar knowledge.war
46 | ```
47 | ###创建索引库
48 | 通过浏览器访问创建索引的接口:
49 | ```
50 | http://ip:8080/createIndex
51 | ```
52 | ###测试结果
53 | 浏览器访问
54 | ```
55 | http://ip:8080/getAnswer?question=收不到验证码
56 | ```
57 | ## 语义相似度(余弦相似度分析、词林)
58 | 语义相似度计算采用余弦相似度计算的方法,针对专业性的知识库,经验证,余弦相似度最适合,调用方式(SimilarityAnalyzeUnfamiliarWords)为:
59 | ``` java
60 | double score = similarAnalyze.sentenceSimilarity(seg_question, seg_question2);
61 | //计算第一句话的每一个词和另一句话中最相似的词的相似度
62 | for (Term sentence1Word : sentence1Words) {
63 | ++count1;
64 | sum1 += this.calMaxSimilarity(sentence1Word.word, sentence2Words);
65 | }
66 | //计算第二句话的每一个词和另一句话中最相似的词的相似度
67 | for (Term sentence2Word : sentence2Words) {
68 | ++count2;
69 | sum2 += this.calMaxSimilarity(sentence2Word.word, sentence1Words);
70 | }
71 | //检测数量是不是为0是为了避免计算过程中产生NAN导致报错
72 | if (count1 == 0) {
73 | if (count2 == 0) {
74 | return 0F;
75 | } else {
76 | return sum2 / count2;
77 | }
78 | } else if (count2 == 0) {
79 | return sum1 / count1;
80 | }
81 | //取相似度最小的那个,能够避免长短文本比较而产生文本包含关系的问题
82 | return Math.min(sum1 / (count1), sum2 / count2);
83 | ```
84 | 相似度计算实现原理:
85 |
86 | 相似度计算代码实现:
87 | ```
88 | for (int i = 0; i < vec1.length; ++i) {
89 | dist += vec1[i] * vec2[i];
90 | sum1 += Math.pow(vec1[i], 2);
91 | sum2 += Math.pow(vec2[i], 2);
92 | }
93 | double result = dist / Math.sqrt(sum1 * sum2);
94 | //在计算过程中,由于浮点运算的偏差问题,存在比较小的误差,
95 | //为避免大于1这种情况对后续计算过程的影响,暂时将相似度控制到100%以内
96 | return result > 1.0 ? 1.0D : result;
97 | ```
98 | ## 程序功能代码入口
99 | 所有的程序功能入口都在QAController类中
100 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPAnalyzer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import org.apache.lucene.analysis.Analyzer;
5 | import org.apache.lucene.analysis.Tokenizer;
6 |
7 | import java.util.Set;
8 |
9 | public class HanLPAnalyzer extends Analyzer {
10 | private boolean enablePorterStemming;
11 | private Set filter;
12 |
13 | /**
14 | * @param filter 停用词
15 | * @param enablePorterStemming 是否分析词干(仅限英文)
16 | */
17 | public HanLPAnalyzer(Set filter, boolean enablePorterStemming) {
18 | this.filter = filter;
19 | this.enablePorterStemming = enablePorterStemming;
20 | }
21 |
22 | /**
23 | * @param enablePorterStemming 是否分析词干.进行单复数,时态的转换
24 | */
25 | public HanLPAnalyzer(boolean enablePorterStemming) {
26 | this.enablePorterStemming = enablePorterStemming;
27 | }
28 |
29 | public HanLPAnalyzer() {
30 | super();
31 | }
32 |
33 | /**
34 | * 重载Analyzer接口,构造分词组件
35 | */
36 | @Override
37 | protected TokenStreamComponents createComponents(String fieldName) {
38 | Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment(), filter, enablePorterStemming);
39 | return new TokenStreamComponents(tokenizer);
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPIndexAnalyzer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import org.apache.lucene.analysis.Analyzer;
5 | import org.apache.lucene.analysis.Tokenizer;
6 |
7 | import java.util.Set;
8 |
9 | public class HanLPIndexAnalyzer extends Analyzer {
10 |
11 | private boolean pstemming;
12 | private Set filter;
13 |
14 | /**
15 | * @param filter 停用词
16 | * @param pstemming 是否分析词干
17 | */
18 | public HanLPIndexAnalyzer(Set filter, boolean pstemming) {
19 | this.filter = filter;
20 | this.pstemming = pstemming;
21 | }
22 |
23 | /**
24 | * @param pstemming 是否分析词干.进行单复数,时态的转换
25 | */
26 | public HanLPIndexAnalyzer(boolean pstemming) {
27 | this.pstemming = pstemming;
28 | }
29 |
30 | public HanLPIndexAnalyzer() {
31 | super();
32 | }
33 |
34 | @Override
35 | protected TokenStreamComponents createComponents(String fieldName) {
36 | Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true), filter, pstemming);
37 | return new TokenStreamComponents(tokenizer);
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPTokenizer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene;
2 |
3 |
4 | import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
5 | import com.hankcs.hanlp.corpus.tag.Nature;
6 | import com.hankcs.hanlp.seg.Segment;
7 | import com.hankcs.hanlp.seg.common.Term;
8 | import com.hankcs.hanlp.utility.TextUtility;
9 | import org.apache.lucene.analysis.Tokenizer;
10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
11 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
12 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
13 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
14 |
15 | import java.io.BufferedReader;
16 | import java.io.IOException;
17 | import java.util.Set;
18 |
19 | /**
20 | * Tokenizer,抄袭ansj的
21 | */
22 | public class HanLPTokenizer extends Tokenizer {
23 | // 当前词
24 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
25 | // 偏移量
26 | private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
27 | // 距离
28 | private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
29 | private final PorterStemmer stemmer = new PorterStemmer();
30 | // 词性
31 | private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
32 | private SegmentWrapper segment;
33 | private BinTrie filter;
34 | private boolean enablePorterStemming;
35 | /**
36 | * 单文档当前所在的总offset,当reset(切换multi-value fields中的value)的时候不清零,在end(切换field)时清零
37 | */
38 | private int totalOffset = 0;
39 |
40 | /**
41 | * @param segment HanLP中的某个分词器
42 | * @param filter 停用词
43 | * @param enablePorterStemming 英文原型转换
44 | */
45 | public HanLPTokenizer(Segment segment, Set filter, boolean enablePorterStemming) {
46 | super();
47 | this.segment = new SegmentWrapper(input, segment);
48 | if (filter != null && filter.size() > 0) {
49 | this.filter = new BinTrie();
50 | for (String stopWord : filter) {
51 | this.filter.put(stopWord, null);
52 | }
53 | }
54 | this.enablePorterStemming = enablePorterStemming;
55 | }
56 |
57 | @Override
58 | final public boolean incrementToken() throws IOException {
59 | clearAttributes();
60 | int position = 0;
61 | Term term;
62 | boolean un_increased = true;
63 | do { //循环过滤到干扰项字符。有标点以及空格等
64 | term = segment.next();
65 | if (term == null) {
66 | break;
67 | }
68 | if (TextUtility.isBlank(term.word)) { // 过滤掉空白符,提高索引效率
69 | continue;
70 | }
71 | if (enablePorterStemming && term.nature == Nature.nx) {
72 | term.word = stemmer.stem(term.word);
73 | }
74 |
75 | if (filter != null && filter.containsKey(term.word)) {
76 | continue;
77 | } else {
78 | ++position;
79 | un_increased = false;
80 | }
81 | }
82 | while (un_increased);
83 |
84 | if (term != null) {
85 | positionAttr.setPositionIncrement(position);
86 | termAtt.setEmpty().append(term.word);
87 | offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
88 | correctOffset(totalOffset + term.offset + term.word.length()));
89 | typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
90 | return true;
91 | } else {
92 | totalOffset += segment.offset;
93 | return false;
94 | }
95 | }
96 |
97 | @Override
98 | public void end() throws IOException {
99 | super.end();
100 | offsetAtt.setOffset(totalOffset, totalOffset);
101 | totalOffset = 0;
102 | }
103 |
104 | /**
105 | * 必须重载的方法,否则在批量索引文件时将会导致文件索引失败
106 | */
107 | @Override
108 | public void reset() throws IOException {
109 | super.reset();
110 | segment.reset(new BufferedReader(this.input));
111 | }
112 |
113 | }
114 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.corpus.io.IOUtil;
5 | import com.hankcs.hanlp.seg.Segment;
6 | import com.hankcs.hanlp.seg.common.Term;
7 | import com.hankcs.hanlp.tokenizer.TraditionalChineseTokenizer;
8 | import org.apache.lucene.analysis.Tokenizer;
9 | import org.apache.lucene.analysis.util.TokenizerFactory;
10 | import org.apache.lucene.util.AttributeFactory;
11 |
12 | import java.util.List;
13 | import java.util.Map;
14 | import java.util.Set;
15 | import java.util.TreeSet;
16 |
17 | public class HanLPTokenizerFactory extends TokenizerFactory {
18 | private boolean enableIndexMode;
19 | private boolean enablePorterStemming;
20 | private boolean enableNumberQuantifierRecognize;
21 | private boolean enableCustomDictionary;
22 | private boolean enableCustomDictionaryForcing;
23 | private boolean enableTranslatedNameRecognize;
24 | private boolean enableJapaneseNameRecognize;
25 | private boolean enableOrganizationRecognize;
26 | private boolean enablePlaceRecognize;
27 | private boolean enableNameRecognize;
28 | private boolean enableTraditionalChineseMode;
29 | private String algorithm;
30 | private Set stopWordDictionary;
31 |
32 | /**
33 | * 初始化工厂类
34 | *
35 | * @param args 通过这个Map保存xml中的配置项
36 | */
37 | public HanLPTokenizerFactory(Map args) {
38 | super(args);
39 | enableIndexMode = getBoolean(args, "enableIndexMode", true);
40 | enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
41 | enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
42 | enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
43 | enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", true);
44 | enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
45 | enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
46 | enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
47 | enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
48 | enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
49 | enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
50 | HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
51 | algorithm = getString(args, "algorithm", "viterbi");
52 | Set customDictionaryPathSet = getSet(args, "customDictionaryPath");
53 | if (customDictionaryPathSet != null) {
54 | HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]);
55 | }
56 | String stopWordDictionaryPath = get(args, "stopWordDictionaryPath");
57 | if (stopWordDictionaryPath != null) {
58 | stopWordDictionary = new TreeSet<>();
59 | stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath));
60 | }
61 | if (getBoolean(args, "enableDebug", false)) {
62 | HanLP.Config.enableDebug();
63 | }
64 | }
65 |
66 | protected final String getString(Map args, String name, String defaultVal) {
67 | String s = args.remove(name);
68 | return s == null ? defaultVal : s;
69 | }
70 |
71 | @Override
72 | public Tokenizer create(AttributeFactory factory) {
73 | Segment segment = HanLP.newSegment(algorithm).enableOffset(true).enableIndexMode(enableIndexMode)
74 | .enableNameRecognize(enableNameRecognize)
75 | .enableNumberQuantifierRecognize(enableNumberQuantifierRecognize)
76 | .enableCustomDictionary(enableCustomDictionary)
77 | .enableCustomDictionaryForcing(enableCustomDictionaryForcing)
78 | .enableTranslatedNameRecognize(enableTranslatedNameRecognize)
79 | .enableJapaneseNameRecognize(enableJapaneseNameRecognize)
80 | .enableOrganizationRecognize(enableOrganizationRecognize)
81 | .enablePlaceRecognize(enablePlaceRecognize);
82 | if (enableTraditionalChineseMode) {
83 | segment.enableIndexMode(false);
84 | Segment inner = segment;
85 | TraditionalChineseTokenizer.SEGMENT = inner;
86 | segment = new Segment() {
87 | @Override
88 | protected List segSentence(char[] sentence) {
89 | List termList = TraditionalChineseTokenizer.segment(new String(sentence));
90 | return termList;
91 | }
92 | };
93 | }
94 |
95 | return new HanLPTokenizer(segment
96 | , stopWordDictionary, enablePorterStemming);
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/PorterStemmer.java:
--------------------------------------------------------------------------------
1 | package com.hankcs.lucene;
2 |
import org.apache.lucene.util.ArrayUtil;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
10 |
11 | /**
12 | * 抄袭lucene的英文处理
13 | * Stemmer, implementing the Porter Stemming Algorithm
14 | *
15 | * The Stemmer class transforms a word into its root form. The input word can be
16 | * provided a character at time (by calling add()), or at once by calling one of
17 | * the various stem(something) methods.
18 | */
19 |
20 | public class PorterStemmer {
21 | private static final int INITIAL_SIZE = 50;
22 | private char[] b;
23 | private int i, /* offset into b */
24 | j, k, k0;
25 | private boolean dirty = false;
26 |
27 | public PorterStemmer() {
28 | b = new char[INITIAL_SIZE];
29 | i = 0;
30 | }
31 |
32 | /**
33 | * Test program for demonstrating the Stemmer. It reads a file and stems
34 | * each word, writing the result to standard out. Usage: Stemmer file-name
35 | */
36 | public static void main(String[] args) {
37 | PorterStemmer s = new PorterStemmer();
38 |
39 | for (String arg : args) {
40 | try (InputStream in = new FileInputStream(arg)) {
41 | byte[] buffer = new byte[1024];
42 | int bufferLen, offset, ch;
43 |
44 | bufferLen = in.read(buffer);
45 | offset = 0;
46 | s.reset();
47 |
48 | while (true) {
49 | if (offset < bufferLen) ch = buffer[offset++];
50 | else {
51 | bufferLen = in.read(buffer);
52 | offset = 0;
53 | if (bufferLen < 0) ch = -1;
54 | else ch = buffer[offset++];
55 | }
56 |
57 | if (Character.isLetter((char) ch)) {
58 | s.add(Character.toLowerCase((char) ch));
59 | } else {
60 | s.stem();
61 | System.out.print(s.toString());
62 | s.reset();
63 | if (ch < 0) break;
64 | else {
65 | System.out.print((char) ch);
66 | }
67 | }
68 | }
69 | } catch (IOException e) {
70 | System.out.println("error reading " + arg);
71 | }
72 | }
73 | }
74 |
75 | /**
76 | * reset() resets the stemmer so it can stem another word. If you invoke the
77 | * stemmer by calling add(char) and then stem(), you must call reset()
78 | * before starting another word.
79 | */
80 | public void reset() {
81 | i = 0;
82 | dirty = false;
83 | }
84 |
85 | /**
86 | * Add a character to the word being stemmed. When you are finished adding
87 | * characters, you can call stem(void) to process the word.
88 | */
89 | public void add(char ch) {
90 | if (b.length <= i) {
91 | b = ArrayUtil.grow(b, i + 1);
92 | }
93 | b[i++] = ch;
94 | }
95 |
96 | /**
97 | * After a word has been stemmed, it can be retrieved by toString(), or a
98 | * reference to the internal buffer can be retrieved by getResultBuffer and
99 | * getResultLength (which is generally more efficient.)
100 | */
101 | @Override
102 | public String toString() {
103 | return new String(b, 0, i);
104 | }
105 |
106 | /**
107 | * Returns the length of the word resulting from the stemming process.
108 | */
109 | public int getResultLength() {
110 | return i;
111 | }
112 |
113 | /* cons(i) is true <=> b[i] is a consonant. */
114 |
115 | /**
116 | * Returns a reference to a character buffer containing the results of the
117 | * stemming process. You also need to consult getResultLength() to determine
118 | * the length of the result.
119 | */
120 | public char[] getResultBuffer() {
121 | return b;
122 | }
123 |
124 | /*
125 | * m() measures the number of consonant sequences between k0 and j. if c is
126 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
127 | * presence,
128 | *
129 | * gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3
130 | * ....
131 | */
132 |
133 | private boolean cons(int i) {
134 | switch (b[i]) {
135 | case 'a':
136 | case 'e':
137 | case 'i':
138 | case 'o':
139 | case 'u':
140 | return false;
141 | case 'y':
142 | return (i == k0) || !cons(i - 1);
143 | default:
144 | return true;
145 | }
146 | }
147 |
148 | /* vowelinstem() is true <=> k0,...j contains a vowel */
149 |
150 | private int m() {
151 | int n = 0;
152 | int i = k0;
153 | while (true) {
154 | if (i > j) return n;
155 | if (!cons(i)) break;
156 | i++;
157 | }
158 | i++;
159 | while (true) {
160 | while (true) {
161 | if (i > j) return n;
162 | if (cons(i)) break;
163 | i++;
164 | }
165 | i++;
166 | n++;
167 | while (true) {
168 | if (i > j) return n;
169 | if (!cons(i)) break;
170 | i++;
171 | }
172 | i++;
173 | }
174 | }
175 |
176 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
177 |
178 | private boolean vowelinstem() {
179 | int i;
180 | for (i = k0; i <= j; i++)
181 | if (!cons(i)) return true;
182 | return false;
183 | }
184 |
185 | /*
186 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
187 | * and also if the second c is not w,x or y. this is used when trying to
188 | * restore an e at the end of a short word. e.g.
189 | *
190 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray.
191 | */
192 |
193 | private boolean doublec(int j) {
194 | if (j < k0 + 1) return false;
195 | if (b[j] != b[j - 1]) return false;
196 | return cons(j);
197 | }
198 |
199 | private boolean cvc(int i) {
200 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) return false;
201 | else {
202 | int ch = b[i];
203 | if (ch == 'w' || ch == 'x' || ch == 'y') return false;
204 | }
205 | return true;
206 | }
207 |
208 | /*
209 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting
210 | * k.
211 | */
212 |
213 | private boolean ends(String s) {
214 | int l = s.length();
215 | int o = k - l + 1;
216 | if (o < k0) return false;
217 | for (int i = 0; i < l; i++)
218 | if (b[o + i] != s.charAt(i)) return false;
219 | j = k - l;
220 | return true;
221 | }
222 |
223 | /* r(s) is used further down. */
224 |
225 | void setto(String s) {
226 | int l = s.length();
227 | int o = j + 1;
228 | for (int i = 0; i < l; i++)
229 | b[o + i] = s.charAt(i);
230 | k = j + l;
231 | dirty = true;
232 | }
233 |
234 | /*
235 | * step1() gets rid of plurals and -ed or -ing. e.g.
236 | *
237 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat
238 | *
239 | * feed -> feed agreed -> agree disabled -> disable
240 | *
241 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing ->
242 | * mess
243 | *
244 | * meetings -> meet
245 | */
246 |
247 | void r(String s) {
248 | if (m() > 0) setto(s);
249 | }
250 |
251 | /* step2() turns terminal y to i when there is another vowel in the stem. */
252 |
253 | private void step1() {
254 | if (b[k] == 's') {
255 | if (ends("sses")) k -= 2;
256 | else if (ends("ies")) setto("i");
257 | else if (b[k - 1] != 's') k--;
258 | }
259 | if (ends("eed")) {
260 | if (m() > 0) k--;
261 | } else if ((ends("ed") || ends("ing")) && vowelinstem()) {
262 | k = j;
263 | if (ends("at")) setto("ate");
264 | else if (ends("bl")) setto("ble");
265 | else if (ends("iz")) setto("ize");
266 | else if (doublec(k)) {
267 | int ch = b[k--];
268 | if (ch == 'l' || ch == 's' || ch == 'z') k++;
269 | } else if (m() == 1 && cvc(k)) setto("e");
270 | }
271 | }
272 |
273 | /*
274 | * step3() maps double suffices to single ones. so -ization ( = -ize plus
275 | * -ation) maps to -ize etc. note that the string before the suffix must
276 | * give m() > 0.
277 | */
278 |
279 | private void step2() {
280 | if (ends("y") && vowelinstem()) {
281 | b[k] = 'i';
282 | dirty = true;
283 | }
284 | }
285 |
    /*
     * step3() maps double suffices to single ones, so -ization ( = -ize plus
     * -ation) maps to -ize etc. Note that the string before the suffix must
     * give m() > 0; r() only commits the replacement in that case.
     */

    private void step3() {
        if (k == k0) return; /* For Bug 1 */
        // Dispatch on the penultimate character so only a handful of the
        // candidate suffixes need to be tested for any given word.
        switch (b[k - 1]) {
            case 'a':
                if (ends("ational")) {
                    r("ate");
                    break;
                }
                if (ends("tional")) {
                    r("tion");
                    break;
                }
                break;
            case 'c':
                if (ends("enci")) {
                    r("ence");
                    break;
                }
                if (ends("anci")) {
                    r("ance");
                    break;
                }
                break;
            case 'e':
                if (ends("izer")) {
                    r("ize");
                    break;
                }
                break;
            case 'l':
                if (ends("bli")) {
                    r("ble");
                    break;
                }
                if (ends("alli")) {
                    r("al");
                    break;
                }
                if (ends("entli")) {
                    r("ent");
                    break;
                }
                if (ends("eli")) {
                    r("e");
                    break;
                }
                if (ends("ousli")) {
                    r("ous");
                    break;
                }
                break;
            case 'o':
                // "ization" must be tested before "ation" (longest match first).
                if (ends("ization")) {
                    r("ize");
                    break;
                }
                if (ends("ation")) {
                    r("ate");
                    break;
                }
                if (ends("ator")) {
                    r("ate");
                    break;
                }
                break;
            case 's':
                if (ends("alism")) {
                    r("al");
                    break;
                }
                if (ends("iveness")) {
                    r("ive");
                    break;
                }
                if (ends("fulness")) {
                    r("ful");
                    break;
                }
                if (ends("ousness")) {
                    r("ous");
                    break;
                }
                break;
            case 't':
                if (ends("aliti")) {
                    r("al");
                    break;
                }
                if (ends("iviti")) {
                    r("ive");
                    break;
                }
                if (ends("biliti")) {
                    r("ble");
                    break;
                }
                break;
            case 'g':
                if (ends("logi")) {
                    r("log");
                    break;
                }
        }
    }
392 |
    /* step4() deals with -ic-, -full, -ness etc., similar strategy to step3. */

    private void step4() {
        // Dispatch on the last character; r() again commits only when the
        // remaining stem has m() > 0.
        switch (b[k]) {
            case 'e':
                if (ends("icate")) {
                    r("ic");
                    break;
                }
                if (ends("ative")) {
                    r("");
                    break;
                }
                if (ends("alize")) {
                    r("al");
                    break;
                }
                break;
            case 'i':
                if (ends("iciti")) {
                    r("ic");
                    break;
                }
                break;
            case 'l':
                if (ends("ical")) {
                    r("ic");
                    break;
                }
                if (ends("ful")) {
                    r("");
                    break;
                }
                break;
            case 's':
                if (ends("ness")) {
                    r("");
                    break;
                }
                break;
        }
    }
435 |
    /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */

    private void step5() {
        if (k == k0) return; /* for Bug 1 */
        // Convention in this switch: a matched candidate suffix *breaks* out
        // (with j set by ends() to the char before the suffix); an unmatched
        // case *returns*, leaving the word untouched. The actual strip happens
        // once, at the bottom, and only when m() > 1.
        switch (b[k - 1]) {
            case 'a':
                if (ends("al")) break;
                return;
            case 'c':
                if (ends("ance")) break;
                if (ends("ence")) break;
                return;
            case 'e':
                if (ends("er")) break;
                return;
            case 'i':
                if (ends("ic")) break;
                return;
            case 'l':
                if (ends("able")) break;
                if (ends("ible")) break;
                return;
            case 'n':
                if (ends("ant")) break;
                if (ends("ement")) break;
                if (ends("ment")) break;
                /* "ement", "ment" are tested before "ent" (longest match first) */
                if (ends("ent")) break;
                return;
            case 'o':
                // "-ion" only counts after 's' or 't' ("adoption", "decision").
                if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
                /* j >= 0 fixes Bug 2 */
                if (ends("ou")) break;
                return;
                /* takes care of -ous */
            case 's':
                if (ends("ism")) break;
                return;
            case 't':
                if (ends("ate")) break;
                if (ends("iti")) break;
                return;
            case 'u':
                if (ends("ous")) break;
                return;
            case 'v':
                if (ends("ive")) break;
                return;
            case 'z':
                if (ends("ize")) break;
                return;
            default:
                return;
        }
        if (m() > 1) k = j;
    }
492 |
    /*
     * step6() removes a final -e if m() > 1, and reduces a final -ll to -l
     * if m() > 1 (e.g. "controll" -> "control").
     */
    private void step6() {
        j = k;
        if (b[k] == 'e') {
            int a = m();
            // Keep the 'e' of short stems ending in cvc, e.g. "rate".
            if (a > 1 || a == 1 && !cvc(k - 1)) k--;
        }
        if (b[k] == 'l' && doublec(k) && m() > 1) k--;
    }
501 |
502 | /**
503 | * Stem a word provided as a String. Returns the result as a String.
504 | */
505 | public String stem(String s) {
506 | if (stem(s.toCharArray(), s.length())) return toString();
507 | else return s;
508 | }
509 |
510 | /**
511 | * Stem a word contained in a char[]. Returns true if the stemming process
512 | * resulted in a word different from the input. You can retrieve the result
513 | * with getResultLength()/getResultBuffer() or toString().
514 | */
515 | public boolean stem(char[] word) {
516 | return stem(word, word.length);
517 | }
518 |
519 | /**
520 | * Stem a word contained in a portion of a char[] array. Returns true if the
521 | * stemming process resulted in a word different from the input. You can
522 | * retrieve the result with getResultLength()/getResultBuffer() or
523 | * toString().
524 | */
525 | public boolean stem(char[] wordBuffer, int offset, int wordLen) {
526 | reset();
527 | if (b.length < wordLen) {
528 | b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
529 | }
530 | System.arraycopy(wordBuffer, offset, b, 0, wordLen);
531 | i = wordLen;
532 | return stem(0);
533 | }
534 |
    /**
     * Stems a word contained in a leading portion of a char[] array. Returns
     * true if the stemming process resulted in a word different from the input.
     * You can retrieve the result with getResultLength()/getResultBuffer() or
     * toString().
     *
     * @param word    buffer whose first {@code wordLen} chars are the word
     * @param wordLen number of characters in the word
     * @return true when the stemmed form differs from the input
     */
    public boolean stem(char[] word, int wordLen) {
        return stem(word, 0, wordLen);
    }
544 |
    /**
     * Stems the word previously placed into the stemmer buffer through calls
     * to add(). Returns true if the stemming process resulted in a word
     * different from the input. You can retrieve the result with
     * getResultLength()/getResultBuffer() or toString().
     *
     * @return true when the stemmed form differs from the input
     */
    public boolean stem() {
        return stem(0);
    }
554 |
    /**
     * Runs the full Porter pipeline over the buffered word b[i0..i-1].
     *
     * @param i0 index of the first character of the word within the buffer
     * @return true when the word was changed (rewritten or shortened)
     */
    public boolean stem(int i0) {
        k = i - 1;   // index of the word's last character
        k0 = i0;     // index of its first character
        // Words of length <= 2 are left untouched, per the Porter algorithm.
        if (k > k0 + 1) {
            step1();
            step2();
            step3();
            step4();
            step5();
            step6();
        }
        // Also, a word is considered dirty if we lopped off letters
        // Thanks to Ifigenia Vairelles for pointing this out.
        if (i != k + 1) dirty = true;
        i = k + 1;   // publish the new length for getResultLength()/toString()
        return dirty;
    }
572 |
573 | }
574 |
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/SegmentWrapper.java:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * hankcs
4 | * me@hankcs.com
5 | * 2015/10/6 18:51
6 | *
7 | *
8 | * Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/
9 | *
10 | */
11 | package com.hankcs.lucene;
12 |
13 | import com.hankcs.hanlp.seg.Segment;
14 | import com.hankcs.hanlp.seg.common.Term;
15 |
16 | import java.io.IOException;
17 | import java.io.Reader;
18 | import java.util.HashSet;
19 | import java.util.Iterator;
20 | import java.util.List;
21 | import java.util.Set;
22 |
23 | /**
24 | * 将分词器包装起来,每次输出一个token
25 | *
26 | * @author hankcs
27 | */
28 | public class SegmentWrapper {
29 | /**
30 | * 缓冲区大小
31 | */
32 | private static final int BUFFER_SIZE = 512;
33 | /**
34 | * 句子分隔符
35 | */
36 | private static final Set delimiterCharSet = new HashSet() {{
37 | add('\r');
38 | add('\n');
39 | add(';');
40 | add(';');
41 | add('。');
42 | add('!');
43 | add('!');
44 | }};
45 | /**
46 | * term的偏移量,由于wrapper是按行读取的,必须对term.offset做一个校正
47 | */
48 | int offset;
49 | /**
50 | * 输入
51 | */
52 | private Reader input;
53 | /**
54 | * 分词器
55 | */
56 | private Segment segment;
57 | /**
58 | * 分词结果
59 | */
60 | private Iterator iterator;
61 | /**
62 | * 缓冲区
63 | */
64 | private char[] buffer = new char[BUFFER_SIZE];
65 | /**
66 | * 缓冲区未处理的下标
67 | */
68 | private int remainSize = 0;
69 |
70 | public SegmentWrapper(Reader reader, Segment segment) {
71 | this.input = reader;
72 | this.segment = segment;
73 | }
74 |
75 | /**
76 | * 重置分词器
77 | *
78 | * @param reader
79 | */
80 | public void reset(Reader reader) {
81 | input = reader;
82 | offset = 0;
83 | iterator = null;
84 | }
85 |
86 | public Term next() throws IOException {
87 | if (iterator != null && iterator.hasNext()) return iterator.next();
88 | String line = readLine();
89 | if (line == null) return null;
90 | List termList = segment.seg(line);
91 | if (termList.size() == 0) return null;
92 | for (Term term : termList) {
93 | term.offset += offset;
94 | }
95 | offset += line.length();
96 | iterator = termList.iterator();
97 | return iterator.next();
98 | }
99 |
100 | private String readLine() throws IOException {
101 | int offset = 0;
102 | int length = BUFFER_SIZE;
103 | if (remainSize > 0) {
104 | offset = remainSize;
105 | length -= remainSize;
106 | }
107 | int n = input.read(buffer, offset, length);
108 | if (n < 0) {
109 | if (remainSize != 0) {
110 | String lastLine = new String(buffer, 0, remainSize);
111 | remainSize = 0;
112 | return lastLine;
113 | }
114 | return null;
115 | }
116 | n += offset;
117 |
118 | int eos = lastIndexOfEos(buffer, n);
119 | String line = new String(buffer, 0, eos);
120 | remainSize = n - eos;
121 | System.arraycopy(buffer, eos, buffer, 0, remainSize);
122 | return line;
123 | }
124 |
125 | private int lastIndexOfEos(char[] buffer, int length) {
126 | for (int i = length - 1; i > 0; i--) {
127 | if (delimiterCharSet.contains(buffer[i])) {
128 | return i + 1;
129 | }
130 | }
131 | return length;
132 | }
133 | }
134 |
--------------------------------------------------------------------------------
/src/main/java/com/watt/CloudApplication.java:
--------------------------------------------------------------------------------
1 | package com.watt;
2 |
3 | import org.springframework.boot.SpringApplication;
4 | import org.springframework.boot.autoconfigure.SpringBootApplication;
5 |
6 | @SpringBootApplication
7 | public class CloudApplication {
8 | public static void main(String[] args) {
9 | SpringApplication.run(CloudApplication.class, args);
10 | }
11 | }
12 |
--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/DbConfig.java:
--------------------------------------------------------------------------------
1 | package com.watt.configure;
2 |
3 | import org.springframework.boot.context.properties.ConfigurationProperties;
4 | import org.springframework.context.annotation.Configuration;
5 |
/**
 * MySQL connection settings bound from the {@code db.mysql.*} properties
 * (see application.yml): driver class, JDBC URL, user and password.
 */
@Configuration
@ConfigurationProperties(prefix = "db.mysql")
public class DbConfig {
    // JDBC driver class name (db.mysql.driver-class)
    private String driverClass;
    // JDBC connection URL (db.mysql.jdbc-url)
    private String jdbcUrl;
    // database user (db.mysql.user)
    private String user;
    // database password (db.mysql.password)
    private String password;

    public String getDriverClass() {
        return driverClass;
    }

    public void setDriverClass(String driverClass) {
        this.driverClass = driverClass;
    }

    public String getJdbcUrl() {
        return jdbcUrl;
    }

    public void setJdbcUrl(String jdbcUrl) {
        this.jdbcUrl = jdbcUrl;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }
}
46 |
--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/LuceneConfig.java:
--------------------------------------------------------------------------------
1 | package com.watt.configure;
2 |
3 | import org.apache.lucene.index.DirectoryReader;
4 | import org.apache.lucene.search.IndexSearcher;
5 | import org.apache.lucene.store.Directory;
6 | import org.apache.lucene.store.FSDirectory;
7 | import org.slf4j.Logger;
8 | import org.slf4j.LoggerFactory;
9 | import org.springframework.boot.context.properties.ConfigurationProperties;
10 | import org.springframework.context.annotation.Configuration;
11 |
12 | import java.io.IOException;
13 | import java.nio.file.FileSystems;
14 |
15 | @Configuration
16 | @ConfigurationProperties(prefix = "lucene")
17 | public class LuceneConfig {
18 | private static Directory directory;
19 | private String root;
20 | private static DirectoryReader reader;
21 | private final Logger logger = LoggerFactory.getLogger(this.getClass());
22 | private String indexKey;
23 | private String vectorPath;
24 | private String tfidfPath;
25 |
26 | private Directory getDirectory() {
27 | try {
28 | directory = FSDirectory.open(FileSystems.getDefault().getPath(root));
29 | } catch (IOException e) {
30 | logger.error("directory对象打开失败");
31 | return null;
32 | }
33 | return directory;
34 | }
35 |
36 | public IndexSearcher getIndexSearcher() {
37 | Directory directory = getDirectory();
38 | if (directory == null) {
39 | return null;
40 | }
41 | try {
42 | reader = DirectoryReader.open(directory);
43 | DirectoryReader tr = DirectoryReader.openIfChanged(reader);
44 | if (tr != null) {
45 | reader.close();
46 | reader = tr;
47 | }
48 |
49 | return new IndexSearcher(reader);
50 | } catch (IOException e) {
51 | logger.error("indexReader打开失败,不能继续");
52 | }
53 | return null;
54 | }
55 |
56 | public String getRoot() {
57 | return root;
58 | }
59 |
60 | public void setRoot(String root) {
61 | this.root = root;
62 | }
63 |
64 | public String getIndexKey() {
65 | return indexKey;
66 | }
67 |
68 | public void setIndexKey(String indexKey) {
69 | this.indexKey = indexKey;
70 | }
71 |
72 | public String getVectorPath() {
73 | return vectorPath;
74 | }
75 |
76 | public void setVectorPath(String vectorPath) {
77 | this.vectorPath = vectorPath;
78 | }
79 |
80 | public String getTfidfPath() {
81 | return tfidfPath;
82 | }
83 |
84 | public void setTfidfPath(String tfidfPath) {
85 | this.tfidfPath = tfidfPath;
86 | }
87 | }
88 |
89 |
--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/MybatisConfig.java:
--------------------------------------------------------------------------------
1 | package com.watt.configure;
2 |
3 | import org.springframework.boot.context.properties.ConfigurationProperties;
4 | import org.springframework.context.annotation.Configuration;
5 |
/**
 * MyBatis settings bound from the {@code db.mybatis.*} properties: the
 * location of the MyBatis XML configuration file.
 */
@Configuration
@ConfigurationProperties(prefix = "db.mybatis")
public class MybatisConfig {
    // path of the MyBatis configuration XML (db.mybatis.mybatis-xml)
    private String mybatisXml;

    public String getMybatisXml() {
        return mybatisXml;
    }

    public void setMybatisXml(String mybatisXml) {
        this.mybatisXml = mybatisXml;
    }
}
19 |
--------------------------------------------------------------------------------
/src/main/java/com/watt/core/QuestionsIndex.java:
--------------------------------------------------------------------------------
1 | package com.watt.core;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.corpus.synonym.Synonym;
5 | import com.hankcs.hanlp.dictionary.CoreSynonymDictionary;
6 | import com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary;
7 | import com.hankcs.hanlp.seg.Segment;
8 | import com.hankcs.hanlp.seg.common.Term;
9 | import com.hankcs.lucene.HanLPIndexAnalyzer;
10 | import com.watt.configure.LuceneConfig;
11 | import com.watt.mvc.service.QAService;
12 | import com.watt.util.FileUtils;
13 | import org.apache.lucene.document.Document;
14 | import org.apache.lucene.document.Field;
15 | import org.apache.lucene.document.TextField;
16 | import org.apache.lucene.index.IndexWriter;
17 | import org.apache.lucene.index.IndexWriterConfig;
18 | import org.apache.lucene.store.Directory;
19 | import org.apache.lucene.store.FSDirectory;
20 | import org.slf4j.Logger;
21 | import org.slf4j.LoggerFactory;
22 | import org.springframework.beans.factory.annotation.Autowired;
23 | import org.springframework.stereotype.Component;
24 |
25 | import java.io.IOException;
26 | import java.nio.file.FileSystems;
27 | import java.util.List;
28 | import java.util.Map;
29 | import java.util.concurrent.atomic.AtomicInteger;
30 |
/**
 * Creates and persists the Lucene index over the question corpus.
 */
34 | @Component
35 | public class QuestionsIndex {
36 | private final Logger logger = LoggerFactory.getLogger(this.getClass());
37 | private final QAService qaService;
38 | private final LuceneConfig luceneConfig;
39 | private Segment segment = HanLP.newSegment();
40 |
41 | @Autowired
42 | public QuestionsIndex(QAService qaService, LuceneConfig luceneConfig) {
43 | this.qaService = qaService;
44 | this.luceneConfig = luceneConfig;
45 | }
46 |
    /**
     * Initializes (rebuilds) the whole index from scratch.
     */
50 | public void createIndex() {
51 |
52 | FileUtils.clearPath(luceneConfig.getRoot());
53 | IndexWriter writer;
54 | Directory directory;
55 | IndexWriterConfig iwc = new IndexWriterConfig(new HanLPIndexAnalyzer());
56 | //创建目录 directory
57 | try {
58 | directory = FSDirectory.open(FileSystems.getDefault().getPath(luceneConfig.getRoot()));
59 | writer = new IndexWriter(directory, iwc);
60 | } catch (IOException e) {
61 | logger.info("Lucene目录打开异常");
62 | return;
63 | }
64 |
65 | int start = 0;
66 | int pageSize = 1000;
67 | List