├── .gitignore
├── init.sql
├── pom.xml
├── readme.md
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       ├── hankcs
    │   │       │   └── lucene
    │   │       │       ├── HanLPAnalyzer.java
    │   │       │       ├── HanLPIndexAnalyzer.java
    │   │       │       ├── HanLPTokenizer.java
    │   │       │       ├── HanLPTokenizerFactory.java
    │   │       │       ├── PorterStemmer.java
    │   │       │       └── SegmentWrapper.java
    │   │       └── watt
    │   │           ├── CloudApplication.java
    │   │           ├── configure
    │   │           │   ├── DbConfig.java
    │   │           │   ├── LuceneConfig.java
    │   │           │   └── MybatisConfig.java
    │   │           ├── core
    │   │           │   ├── QuestionsIndex.java
    │   │           │   ├── dictionary
    │   │           │   │   ├── CoreAbbreviationDictionary.java
    │   │           │   │   ├── CoreStopWordsDictionary.java
    │   │           │   │   └── MyCustomDictionary.java
    │   │           │   └── nlp
    │   │           │       └── cosinesimlarity
    │   │           │           ├── AtomSegment.java
    │   │           │           ├── IDExtract.java
    │   │           │           ├── SimilarityAnalyze.java
    │   │           │           ├── SimilarityAnalyzeUnfamiliarWords.java
    │   │           │           └── Word2Vec.java
    │   │           ├── data
    │   │           │   └── jdbc
    │   │           │       ├── MySqlDataSource.java
    │   │           │       ├── MySqlSessionFactoryBean.java
    │   │           │       └── MySqlSessionTemplate.java
    │   │           ├── mvc
    │   │           │   ├── beans
    │   │           │   │   ├── CheckResult.java
    │   │           │   │   ├── PlatformResponse.java
    │   │           │   │   └── QAAnalyzeResult.java
    │   │           │   ├── controller
    │   │           │   │   ├── CorpusController.java
    │   │           │   │   └── QAController.java
    │   │           │   ├── dao
    │   │           │   │   ├── QADao.java
    │   │           │   │   └── QADao.xml
    │   │           │   └── service
    │   │           │       └── QAService.java
    │   │           └── util
    │   │               ├── CommonUtils.java
    │   │               ├── FileUtils.java
    │   │               └── NLPUtils.java
    │   ├── resources
    │   │   ├── application.yml
    │   │   ├── hanlp.properties
    │   │   └── mybatis.xml
    │   └── webapp
    │       ├── WEB-INF
    │       │   ├── lib
    │       │   │   └── hanlp-1.7.2.jar
    │       │   └── web.xml
    │       └── index.jsp
    └── test
        └── java
            ├── MapCount.java
            └── wikiCorpus.java

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
### Maven Auto Generate ###
target/
!.mvn/wrapper/maven-wrapper.jar

### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans

### IntelliJ IDEA ###
*.idea
*.iws
*.iml
*.ipr

### NetBeans ###
nbproject/private/
build/
nbbuild/
dist/
nbdist/
.nb-gradle/

### Windows ###
/C:/

--------------------------------------------------------------------------------
/init.sql:
--------------------------------------------------------------------------------
create table knowlede_dictionary_custom
(
    uuid int auto_increment
        primary key,
    word varchar(40) null
)
    charset = gb2312;

create table knowledge_category
(
    id            varchar(64)  not null comment 'primary key'
        primary key,
    create_by     varchar(64)  null comment 'creator',
    create_date   datetime     null comment 'creation time',
    update_by     varchar(64)  null comment 'last modifier',
    update_date   datetime     null comment 'last modification time',
    remarks       varchar(255) null comment 'remarks',
    del_flag      varchar(64)  null comment 'logical delete flag (0: visible; 1: hidden)',
    category_name varchar(64)  null comment 'category name'
)
    comment 'knowledge base categories';

create table knowledge_dictionary_abbreviation
(
    id          varchar(64)  not null comment 'primary key'
        primary key,
    create_by   varchar(64)  null comment 'creator',
    create_date datetime     null comment 'creation time',
    update_by   varchar(64)  null comment 'last modifier',
    update_date datetime     null comment 'last modification time',
    remarks     varchar(255) null comment 'remarks',
    del_flag    varchar(64)  null comment 'logical delete flag (0: visible; 1: hidden)',
    abbr_name   varchar(64)  null comment 'abbreviation',
    full_name   varchar(64)  null comment 'full name'
)
    comment 'abbreviation management';
create table knowledge_dictionary_industry
(
    id            varchar(32)                        not null comment 'primary key'
        primary key,
    industry_name varchar(50)                        not null comment 'industry name',
    create_time   datetime default CURRENT_TIMESTAMP not null comment 'creation time',
    is_valid      char     default 'Y'               not null comment 'valid flag: Y = valid, N = invalid'
)
    comment 'industry dictionary' charset = gb2312;

create table knowledge_dictionary_stopwords
(
    id   int(10) auto_increment
        primary key,
    word varchar(255) null
)
    collate = utf8_bin;

create table knowledge_dictionary_synonym
(
    id          varchar(64) charset gbk not null comment 'primary key'
        primary key,
    create_by   varchar(64)             null comment 'creator',
    create_date datetime                null comment 'creation time',
    update_by   varchar(64)             null comment 'last modifier',
    update_date datetime                null comment 'last modification time',
    remarks     varchar(255)            null comment 'remarks',
    del_flag    varchar(64)             not null comment 'logical delete flag (0: visible; 1: hidden)',
    synonym     varchar(1000)           not null comment 'synonyms',
    type        varchar(1)              not null comment 'type: similar or equal'
)
    comment 'synonym dictionary';

create table knowledge_qa_answer
(
    id           varchar(64)  not null comment 'primary key'
        primary key,
    create_by    varchar(64)  null comment 'creator',
    create_date  datetime     null comment 'creation time',
    update_by    varchar(64)  null comment 'last modifier',
    update_date  datetime     null comment 'last modification time',
    remarks      varchar(255) null comment 'remarks',
    del_flag     varchar(64)  null comment 'logical delete flag (0: visible; 1: hidden)',
    answer       longtext     null comment 'answer',
    reference_id varchar(64)  null comment 'referenced media id',
    media_type   varchar(4)   null comment 'media type',
    category_id  varchar(64)  null comment 'category id'
)
    comment 'Q&A answers';

create table knowledge_qa_logs
(
    id          varchar(64)                        not null comment 'primary key'
        primary key,
    create_by   varchar(64)                        null comment 'creator',
    create_date datetime default CURRENT_TIMESTAMP null comment 'creation time',
    update_by   varchar(64)                        null comment 'last modifier',
    update_date datetime                           null comment 'last modification time',
    remarks     varchar(255)                       null comment 'remarks',
    del_flag    varchar(64) default '0'            null comment 'logical delete flag (0: visible; 1: hidden)',
    question    varchar(128)                       null comment 'question',
    score       varchar(64)                        null comment 'score',
    channel_id  varchar(64)                        null comment 'authorization id / access channel (WeChat, robot, etc.)',
    question_id varchar(64)                        null comment 'question id'
)
    comment 'Q&A logs';

create table knowledge_qa_media
(
    MEDIA_ID      int auto_increment comment 'media id'
        primary key,
    MEDIA_NAME    varchar(100)  not null comment 'media name',
    MEDIA_SUMMARY varchar(2048) null comment 'media summary',
    MEDIA_TYPE    varchar(3)    null comment 'media type (GT: rich text, IMG: image, AU: audio, VI: video)',
    MEDIA_URL     varchar(512)  null comment 'media link (when set, the media file stores the cover image)',
    CREATE_TIME   datetime      not null comment 'creation time',
    UPDATE_TIME   datetime      null comment 'last modification time'
)
    comment 'media';

create table knowledge_qa_question
(
    id          varchar(64)             not null comment 'primary key'
        primary key,
    create_by   varchar(64)             null comment 'creator',
    create_date datetime                null comment 'creation time',
    update_by   varchar(64)             null comment 'last modifier',
    update_date datetime                null comment 'last modification time',
    remarks     varchar(255)            null comment 'remarks',
    del_flag    varchar(64) default '0' null comment 'logical delete flag (0: visible; 1: hidden)',
    question    varchar(64)             null comment 'question',
    answer_id   varchar(64)             null comment 'answer id'
)
    comment 'knowledge base questions';

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.inspur.tax</groupId>
    <artifactId>knowledge-core</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>war</packaging>

    <name>knowledge-core Maven Webapp</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <lucene.version>7.4.0</lucene.version>
        <fastjson.version>1.2.35</fastjson.version>
        <mysql.connector>8.0.11</mysql.connector>
        <spring.version>5.0.8.RELEASE</spring.version>
        <c3p0.version>0.9.5.3</c3p0.version>
        <mybatis-spring.version>1.3.2</mybatis-spring.version>
        <mybatis.version>3.4.4</mybatis.version>
        <hanlp.version>1.7.2</hanlp.version>
    </properties>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.4.RELEASE</version>
    </parent>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>${lucene.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>

        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>${mybatis.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>${mybatis-spring.version}</version>
        </dependency>
        <dependency>
            <groupId>com.mchange</groupId>
            <artifactId>c3p0</artifactId>
            <version>${c3p0.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector}</version>
        </dependency>

        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/src/main/webapp/WEB-INF/lib/hanlp-${hanlp.version}.jar</systemPath>
        </dependency>
    </dependencies>

    <build>
        <finalName>knowledge</finalName>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/resources</directory>
                <includes>
                    <include>**/</include>
                </includes>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/webapp</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/webapp/WEB-INF</directory>
                <targetPath>BOOT-INF/lib/</targetPath>
                <includes>
                    <include>**/*.jar</include>
                </includes>
            </resource>
        </resources>

        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Consulting Knowledge Base System Overview
Thanks to HanLP, Lucene and the other open-source projects that make building a system like this so much easier. Everyone is sincerely invited to help improve this project so that more people can learn from and use it.
## Installation and Deployment
### Preparing the corpus data
Baidu Netdisk download: https://pan.baidu.com/s/1Syhk2Ehv_5Of19bHlFNSig
Extraction code: 04ay
### Initializing the database
MySQL is used for static data storage; please download the script from the same netdisk share:
```
init.sql
```
### Build tooling
The project is managed with Maven and built on the Spring Boot microservice framework. IDEA, Eclipse or a similar IDE is recommended for development.
### Configuration
#### HanLP configuration (hanlp.properties):
Only the root property needs to be set; it is the filesystem path of the HanLP segmentation data package.
``` properties
root=/root/
```
#### Lucene and word-vector configuration
application.yml
``` yaml
lucene:
  root: /root/lucene/ # root directory of the Lucene index
  indexKey: questionWithSynonyms # field key shared between indexing and querying; it never needs to change
  vectorPath: /root/data/wiki_chinese_word2vec.bin # path of the word-vector file
```
#### MySQL configuration
```yaml
db:
  mysql:
    driverClass: com.mysql.cj.jdbc.Driver # the MySQL driver class; no need to change it
    jdbcUrl: jdbc:mysql://IP:3306/tax_knowledge?useUnicode=true&characterEncoding=gb2312 # MySQL host and port
    user: username # MySQL user name
    password: password # MySQL password
```
### Building the project
Run the Maven build (if you are new to Maven, brush up on the basics first):
```
mvn clean install
```
### Starting the application
Since this is a Spring Boot project, starting it only takes running the packaged build:
```
java -jar knowledge.war
```
### Creating the index
Call the index-creation endpoint from a browser:
```
http://ip:8080/createIndex
```
### Testing
Open in a browser:
```
http://ip:8080/getAnswer?question=收不到验证码
```
## Semantic Similarity (cosine similarity, CiLin thesaurus)
Semantic similarity is computed with cosine similarity, which testing showed to be the best fit for a specialized knowledge base. It is invoked through SimilarityAnalyzeUnfamiliarWords:
``` java
double score = similarAnalyze.sentenceSimilarity(seg_question, seg_question2);
// for every word of the first sentence, accumulate its similarity to the most similar word of the second sentence
for (Term sentence1Word : sentence1Words) {
    ++count1;
    sum1 += this.calMaxSimilarity(sentence1Word.word, sentence2Words);
}
// for every word of the second sentence, accumulate its similarity to the most similar word of the first sentence
for (Term sentence2Word : sentence2Words) {
    ++count2;
    sum2 += this.calMaxSimilarity(sentence2Word.word, sentence1Words);
}
// guard against zero counts so the divisions below cannot produce NaN
if (count1 == 0) {
    if (count2 == 0) {
        return 0F;
    } else {
        return sum2 / count2;
    }
} else if (count2 == 0) {
    return sum1 / count1;
}
// take the smaller of the two averages; this avoids inflated scores when a short text is contained in a longer one
return Math.min(sum1 / count1, sum2 / count2);
```
How the similarity of two word vectors is computed:
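The score is the cosine of the angle between the two word vectors, which is exactly what the loop below computes:
```
cos(v1, v2) = Σ v1[i]·v2[i] / ( √(Σ v1[i]²) · √(Σ v2[i]²) )
```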
The implementation of the cosine computation:
```
for (int i = 0; i < vec1.length; ++i) {
    dist += vec1[i] * vec2[i];
    sum1 += Math.pow(vec1[i], 2);
    sum2 += Math.pow(vec2[i], 2);
}
double result = dist / Math.sqrt(sum1 * sum2);
// floating-point rounding can push the result slightly above 1;
// cap it at 1.0 so later calculations never see a similarity above 100%
return result > 1.0 ? 1.0D : result;
```
## Code Entry Points
All of the application's functional entry points are in the QAController class.
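## Analyzer Usage Example
The Lucene analyzers under `com.hankcs.lucene` can also be used on their own. The sketch below is illustrative only: the in-memory `RAMDirectory`, the field name `q` and the sample text are invented for this example, and HanLP still needs its data path configured in `hanlp.properties`.
``` java
import com.hankcs.lucene.HanLPIndexAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        HanLPIndexAnalyzer analyzer = new HanLPIndexAnalyzer();
        // index a single question with the HanLP analyzer
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            Document doc = new Document();
            doc.add(new Field("q", "收不到验证码怎么办", TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
        // query it back; the analyzer segments the query text the same way
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new QueryParser("q", analyzer).parse("验证码");
            TopDocs hits = searcher.search(query, 10);
            for (ScoreDoc sd : hits.scoreDocs) {
                System.out.println(searcher.doc(sd.doc).get("q") + " score=" + sd.score);
            }
        }
    }
}
```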
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPAnalyzer.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

import java.util.Set;

public class HanLPAnalyzer extends Analyzer {
    private boolean enablePorterStemming;
    private Set<String> filter;

    /**
     * @param filter               stop words
     * @param enablePorterStemming whether to stem tokens (English only)
     */
    public HanLPAnalyzer(Set<String> filter, boolean enablePorterStemming) {
        this.filter = filter;
        this.enablePorterStemming = enablePorterStemming;
    }

    /**
     * @param enablePorterStemming whether to stem tokens, normalizing plurals and tense
     */
    public HanLPAnalyzer(boolean enablePorterStemming) {
        this.enablePorterStemming = enablePorterStemming;
    }

    public HanLPAnalyzer() {
        super();
    }

    /**
     * Overrides Analyzer to build the tokenization components.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment(), filter, enablePorterStemming);
        return new TokenStreamComponents(tokenizer);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPIndexAnalyzer.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

import java.util.Set;

public class HanLPIndexAnalyzer extends Analyzer {

    private boolean pstemming;
    private Set<String> filter;

    /**
     * @param filter    stop words
     * @param pstemming whether to stem tokens (English only)
     */
    public HanLPIndexAnalyzer(Set<String> filter, boolean pstemming) {
        this.filter = filter;
        this.pstemming = pstemming;
    }

    /**
     * @param pstemming whether to stem tokens, normalizing plurals and tense
     */
    public HanLPIndexAnalyzer(boolean pstemming) {
        this.pstemming = pstemming;
    }

    public HanLPIndexAnalyzer() {
        super();
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true), filter, pstemming);
        return new TokenStreamComponents(tokenizer);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPTokenizer.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;


import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.TextUtility;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Set;

/**
 * Tokenizer, adapted from ansj
 */
public class HanLPTokenizer extends Tokenizer {
    // current term
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // offset
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // position increment
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
    private final PorterStemmer stemmer = new PorterStemmer();
    // part of speech
    private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private SegmentWrapper segment;
    private BinTrie<String> filter;
    private boolean enablePorterStemming;
    /**
     * Running total offset within the current document. It is not cleared in reset()
     * (switching between the values of a multi-valued field), but is cleared in end()
     * (switching fields).
     */
    private int totalOffset = 0;

    /**
     * @param segment              a HanLP segmenter
     * @param filter               stop words
     * @param enablePorterStemming reduce English tokens to their stems
     */
    public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
        super();
        this.segment = new SegmentWrapper(input, segment);
        if (filter != null && filter.size() > 0) {
            this.filter = new BinTrie<String>();
            for (String stopWord : filter) {
                this.filter.put(stopWord, null);
            }
        }
        this.enablePorterStemming = enablePorterStemming;
    }

    @Override
    final public boolean incrementToken() throws IOException {
        clearAttributes();
        int position = 0;
        Term term;
        boolean un_increased = true;
        do { // loop until a usable term is found, skipping noise such as punctuation and whitespace
            term = segment.next();
            if (term == null) {
                break;
            }
            if (TextUtility.isBlank(term.word)) { // skip blanks to keep the index lean
                continue;
            }
            if (enablePorterStemming && term.nature == Nature.nx) {
                term.word = stemmer.stem(term.word);
            }

            if (filter != null && filter.containsKey(term.word)) {
                continue;
            } else {
                ++position;
                un_increased = false;
            }
        }
        while (un_increased);

        if (term != null) {
            positionAttr.setPositionIncrement(position);
            termAtt.setEmpty().append(term.word);
            offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
                    correctOffset(totalOffset + term.offset + term.word.length()));
            typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
            return true;
        } else {
            totalOffset += segment.offset;
            return false;
        }
    }

    @Override
    public void end() throws IOException {
        super.end();
        offsetAtt.setOffset(totalOffset, totalOffset);
        totalOffset = 0;
    }

    /**
     * Must be overridden; otherwise batch indexing of files fails.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        segment.reset(new BufferedReader(this.input));
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.TraditionalChineseTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public class HanLPTokenizerFactory extends TokenizerFactory {
    private boolean enableIndexMode;
    private boolean enablePorterStemming;
    private boolean enableNumberQuantifierRecognize;
    private boolean enableCustomDictionary;
    private boolean enableCustomDictionaryForcing;
    private boolean enableTranslatedNameRecognize;
    private boolean enableJapaneseNameRecognize;
    private boolean enableOrganizationRecognize;
    private boolean enablePlaceRecognize;
    private boolean enableNameRecognize;
    private boolean enableTraditionalChineseMode;
    private String algorithm;
    private Set<String> stopWordDictionary;

    /**
     * Initializes the factory.
     *
     * @param args the configuration options from the XML, passed in as a Map
     */
    public HanLPTokenizerFactory(Map<String, String> args) {
        super(args);
        enableIndexMode = getBoolean(args, "enableIndexMode", true);
        enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
        enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
        enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
        enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", true);
        enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
        enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
        enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
        enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
        enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
        enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
        HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
        algorithm = getString(args, "algorithm", "viterbi");
        Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
"customDictionaryPath"); 53 | if (customDictionaryPathSet != null) { 54 | HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]); 55 | } 56 | String stopWordDictionaryPath = get(args, "stopWordDictionaryPath"); 57 | if (stopWordDictionaryPath != null) { 58 | stopWordDictionary = new TreeSet<>(); 59 | stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath)); 60 | } 61 | if (getBoolean(args, "enableDebug", false)) { 62 | HanLP.Config.enableDebug(); 63 | } 64 | } 65 | 66 | protected final String getString(Map args, String name, String defaultVal) { 67 | String s = args.remove(name); 68 | return s == null ? defaultVal : s; 69 | } 70 | 71 | @Override 72 | public Tokenizer create(AttributeFactory factory) { 73 | Segment segment = HanLP.newSegment(algorithm).enableOffset(true).enableIndexMode(enableIndexMode) 74 | .enableNameRecognize(enableNameRecognize) 75 | .enableNumberQuantifierRecognize(enableNumberQuantifierRecognize) 76 | .enableCustomDictionary(enableCustomDictionary) 77 | .enableCustomDictionaryForcing(enableCustomDictionaryForcing) 78 | .enableTranslatedNameRecognize(enableTranslatedNameRecognize) 79 | .enableJapaneseNameRecognize(enableJapaneseNameRecognize) 80 | .enableOrganizationRecognize(enableOrganizationRecognize) 81 | .enablePlaceRecognize(enablePlaceRecognize); 82 | if (enableTraditionalChineseMode) { 83 | segment.enableIndexMode(false); 84 | Segment inner = segment; 85 | TraditionalChineseTokenizer.SEGMENT = inner; 86 | segment = new Segment() { 87 | @Override 88 | protected List segSentence(char[] sentence) { 89 | List termList = TraditionalChineseTokenizer.segment(new String(sentence)); 90 | return termList; 91 | } 92 | }; 93 | } 94 | 95 | return new HanLPTokenizer(segment 96 | , stopWordDictionary, enablePorterStemming); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene/PorterStemmer.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene; 2 | 3 | import org.apache.lucene.util.ArrayUtil; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | 9 | import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; 10 | 11 | /** 12 | * 抄袭lucene的英文处理 13 | * Stemmer, implementing the Porter Stemming Algorithm 14 | *

15 | * The Stemmer class transforms a word into its root form. The input word can be 16 | * provided a character at time (by calling add()), or at once by calling one of 17 | * the various stem(something) methods. 18 | */ 19 | 20 | public class PorterStemmer { 21 | private static final int INITIAL_SIZE = 50; 22 | private char[] b; 23 | private int i, /* offset into b */ 24 | j, k, k0; 25 | private boolean dirty = false; 26 | 27 | public PorterStemmer() { 28 | b = new char[INITIAL_SIZE]; 29 | i = 0; 30 | } 31 | 32 | /** 33 | * Test program for demonstrating the Stemmer. It reads a file and stems 34 | * each word, writing the result to standard out. Usage: Stemmer file-name 35 | */ 36 | public static void main(String[] args) { 37 | PorterStemmer s = new PorterStemmer(); 38 | 39 | for (String arg : args) { 40 | try (InputStream in = new FileInputStream(arg)) { 41 | byte[] buffer = new byte[1024]; 42 | int bufferLen, offset, ch; 43 | 44 | bufferLen = in.read(buffer); 45 | offset = 0; 46 | s.reset(); 47 | 48 | while (true) { 49 | if (offset < bufferLen) ch = buffer[offset++]; 50 | else { 51 | bufferLen = in.read(buffer); 52 | offset = 0; 53 | if (bufferLen < 0) ch = -1; 54 | else ch = buffer[offset++]; 55 | } 56 | 57 | if (Character.isLetter((char) ch)) { 58 | s.add(Character.toLowerCase((char) ch)); 59 | } else { 60 | s.stem(); 61 | System.out.print(s.toString()); 62 | s.reset(); 63 | if (ch < 0) break; 64 | else { 65 | System.out.print((char) ch); 66 | } 67 | } 68 | } 69 | } catch (IOException e) { 70 | System.out.println("error reading " + arg); 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * reset() resets the stemmer so it can stem another word. If you invoke the 77 | * stemmer by calling add(char) and then stem(), you must call reset() 78 | * before starting another word. 79 | */ 80 | public void reset() { 81 | i = 0; 82 | dirty = false; 83 | } 84 | 85 | /** 86 | * Add a character to the word being stemmed. When you are finished adding 87 | * characters, you can call stem(void) to process the word. 88 | */ 89 | public void add(char ch) { 90 | if (b.length <= i) { 91 | b = ArrayUtil.grow(b, i + 1); 92 | } 93 | b[i++] = ch; 94 | } 95 | 96 | /** 97 | * After a word has been stemmed, it can be retrieved by toString(), or a 98 | * reference to the internal buffer can be retrieved by getResultBuffer and 99 | * getResultLength (which is generally more efficient.) 100 | */ 101 | @Override 102 | public String toString() { 103 | return new String(b, 0, i); 104 | } 105 | 106 | /** 107 | * Returns the length of the word resulting from the stemming process. 108 | */ 109 | public int getResultLength() { 110 | return i; 111 | } 112 | 113 | /* cons(i) is true <=> b[i] is a consonant. */ 114 | 115 | /** 116 | * Returns a reference to a character buffer containing the results of the 117 | * stemming process. You also need to consult getResultLength() to determine 118 | * the length of the result. 119 | */ 120 | public char[] getResultBuffer() { 121 | return b; 122 | } 123 | 124 | /* 125 | * m() measures the number of consonant sequences between k0 and j. if c is 126 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 127 | * presence, 128 | * 129 | * gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 130 | * .... 
131 | */ 132 | 133 | private boolean cons(int i) { 134 | switch (b[i]) { 135 | case 'a': 136 | case 'e': 137 | case 'i': 138 | case 'o': 139 | case 'u': 140 | return false; 141 | case 'y': 142 | return (i == k0) || !cons(i - 1); 143 | default: 144 | return true; 145 | } 146 | } 147 | 148 | /* vowelinstem() is true <=> k0,...j contains a vowel */ 149 | 150 | private int m() { 151 | int n = 0; 152 | int i = k0; 153 | while (true) { 154 | if (i > j) return n; 155 | if (!cons(i)) break; 156 | i++; 157 | } 158 | i++; 159 | while (true) { 160 | while (true) { 161 | if (i > j) return n; 162 | if (cons(i)) break; 163 | i++; 164 | } 165 | i++; 166 | n++; 167 | while (true) { 168 | if (i > j) return n; 169 | if (!cons(i)) break; 170 | i++; 171 | } 172 | i++; 173 | } 174 | } 175 | 176 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ 177 | 178 | private boolean vowelinstem() { 179 | int i; 180 | for (i = k0; i <= j; i++) 181 | if (!cons(i)) return true; 182 | return false; 183 | } 184 | 185 | /* 186 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant 187 | * and also if the second c is not w,x or y. this is used when trying to 188 | * restore an e at the end of a short word. e.g. 189 | * 190 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray. 191 | */ 192 | 193 | private boolean doublec(int j) { 194 | if (j < k0 + 1) return false; 195 | if (b[j] != b[j - 1]) return false; 196 | return cons(j); 197 | } 198 | 199 | private boolean cvc(int i) { 200 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) return false; 201 | else { 202 | int ch = b[i]; 203 | if (ch == 'w' || ch == 'x' || ch == 'y') return false; 204 | } 205 | return true; 206 | } 207 | 208 | /* 209 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting 210 | * k. 211 | */ 212 | 213 | private boolean ends(String s) { 214 | int l = s.length(); 215 | int o = k - l + 1; 216 | if (o < k0) return false; 217 | for (int i = 0; i < l; i++) 218 | if (b[o + i] != s.charAt(i)) return false; 219 | j = k - l; 220 | return true; 221 | } 222 | 223 | /* r(s) is used further down. */ 224 | 225 | void setto(String s) { 226 | int l = s.length(); 227 | int o = j + 1; 228 | for (int i = 0; i < l; i++) 229 | b[o + i] = s.charAt(i); 230 | k = j + l; 231 | dirty = true; 232 | } 233 | 234 | /* 235 | * step1() gets rid of plurals and -ed or -ing. e.g. 236 | * 237 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat 238 | * 239 | * feed -> feed agreed -> agree disabled -> disable 240 | * 241 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing -> 242 | * mess 243 | * 244 | * meetings -> meet 245 | */ 246 | 247 | void r(String s) { 248 | if (m() > 0) setto(s); 249 | } 250 | 251 | /* step2() turns terminal y to i when there is another vowel in the stem. */ 252 | 253 | private void step1() { 254 | if (b[k] == 's') { 255 | if (ends("sses")) k -= 2; 256 | else if (ends("ies")) setto("i"); 257 | else if (b[k - 1] != 's') k--; 258 | } 259 | if (ends("eed")) { 260 | if (m() > 0) k--; 261 | } else if ((ends("ed") || ends("ing")) && vowelinstem()) { 262 | k = j; 263 | if (ends("at")) setto("ate"); 264 | else if (ends("bl")) setto("ble"); 265 | else if (ends("iz")) setto("ize"); 266 | else if (doublec(k)) { 267 | int ch = b[k--]; 268 | if (ch == 'l' || ch == 's' || ch == 'z') k++; 269 | } else if (m() == 1 && cvc(k)) setto("e"); 270 | } 271 | } 272 | 273 | /* 274 | * step3() maps double suffices to single ones. 
so -ization ( = -ize plus 275 | * -ation) maps to -ize etc. note that the string before the suffix must 276 | * give m() > 0. 277 | */ 278 | 279 | private void step2() { 280 | if (ends("y") && vowelinstem()) { 281 | b[k] = 'i'; 282 | dirty = true; 283 | } 284 | } 285 | 286 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ 287 | 288 | private void step3() { 289 | if (k == k0) return; /* For Bug 1 */ 290 | switch (b[k - 1]) { 291 | case 'a': 292 | if (ends("ational")) { 293 | r("ate"); 294 | break; 295 | } 296 | if (ends("tional")) { 297 | r("tion"); 298 | break; 299 | } 300 | break; 301 | case 'c': 302 | if (ends("enci")) { 303 | r("ence"); 304 | break; 305 | } 306 | if (ends("anci")) { 307 | r("ance"); 308 | break; 309 | } 310 | break; 311 | case 'e': 312 | if (ends("izer")) { 313 | r("ize"); 314 | break; 315 | } 316 | break; 317 | case 'l': 318 | if (ends("bli")) { 319 | r("ble"); 320 | break; 321 | } 322 | if (ends("alli")) { 323 | r("al"); 324 | break; 325 | } 326 | if (ends("entli")) { 327 | r("ent"); 328 | break; 329 | } 330 | if (ends("eli")) { 331 | r("e"); 332 | break; 333 | } 334 | if (ends("ousli")) { 335 | r("ous"); 336 | break; 337 | } 338 | break; 339 | case 'o': 340 | if (ends("ization")) { 341 | r("ize"); 342 | break; 343 | } 344 | if (ends("ation")) { 345 | r("ate"); 346 | break; 347 | } 348 | if (ends("ator")) { 349 | r("ate"); 350 | break; 351 | } 352 | break; 353 | case 's': 354 | if (ends("alism")) { 355 | r("al"); 356 | break; 357 | } 358 | if (ends("iveness")) { 359 | r("ive"); 360 | break; 361 | } 362 | if (ends("fulness")) { 363 | r("ful"); 364 | break; 365 | } 366 | if (ends("ousness")) { 367 | r("ous"); 368 | break; 369 | } 370 | break; 371 | case 't': 372 | if (ends("aliti")) { 373 | r("al"); 374 | break; 375 | } 376 | if (ends("iviti")) { 377 | r("ive"); 378 | break; 379 | } 380 | if (ends("biliti")) { 381 | r("ble"); 382 | break; 383 | } 384 | break; 385 | case 'g': 386 | if (ends("logi")) { 387 | r("log"); 388 | break; 389 | } 390 | } 391 | } 392 | 393 | /* step5() takes off -ant, -ence etc., in context vcvc. */ 394 | 395 | private void step4() { 396 | switch (b[k]) { 397 | case 'e': 398 | if (ends("icate")) { 399 | r("ic"); 400 | break; 401 | } 402 | if (ends("ative")) { 403 | r(""); 404 | break; 405 | } 406 | if (ends("alize")) { 407 | r("al"); 408 | break; 409 | } 410 | break; 411 | case 'i': 412 | if (ends("iciti")) { 413 | r("ic"); 414 | break; 415 | } 416 | break; 417 | case 'l': 418 | if (ends("ical")) { 419 | r("ic"); 420 | break; 421 | } 422 | if (ends("ful")) { 423 | r(""); 424 | break; 425 | } 426 | break; 427 | case 's': 428 | if (ends("ness")) { 429 | r(""); 430 | break; 431 | } 432 | break; 433 | } 434 | } 435 | 436 | /* step6() removes a final -e if m() > 1. */ 437 | 438 | private void step5() { 439 | if (k == k0) return; /* for Bug 1 */ 440 | switch (b[k - 1]) { 441 | case 'a': 442 | if (ends("al")) break; 443 | return; 444 | case 'c': 445 | if (ends("ance")) break; 446 | if (ends("ence")) break; 447 | return; 448 | case 'e': 449 | if (ends("er")) break; 450 | return; 451 | case 'i': 452 | if (ends("ic")) break; 453 | return; 454 | case 'l': 455 | if (ends("able")) break; 456 | if (ends("ible")) break; 457 | return; 458 | case 'n': 459 | if (ends("ant")) break; 460 | if (ends("ement")) break; 461 | if (ends("ment")) break; 462 | /* element etc. 
not stripped before the m */ 463 | if (ends("ent")) break; 464 | return; 465 | case 'o': 466 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; 467 | /* j >= 0 fixes Bug 2 */ 468 | if (ends("ou")) break; 469 | return; 470 | /* takes care of -ous */ 471 | case 's': 472 | if (ends("ism")) break; 473 | return; 474 | case 't': 475 | if (ends("ate")) break; 476 | if (ends("iti")) break; 477 | return; 478 | case 'u': 479 | if (ends("ous")) break; 480 | return; 481 | case 'v': 482 | if (ends("ive")) break; 483 | return; 484 | case 'z': 485 | if (ends("ize")) break; 486 | return; 487 | default: 488 | return; 489 | } 490 | if (m() > 1) k = j; 491 | } 492 | 493 | private void step6() { 494 | j = k; 495 | if (b[k] == 'e') { 496 | int a = m(); 497 | if (a > 1 || a == 1 && !cvc(k - 1)) k--; 498 | } 499 | if (b[k] == 'l' && doublec(k) && m() > 1) k--; 500 | } 501 | 502 | /** 503 | * Stem a word provided as a String. Returns the result as a String. 504 | */ 505 | public String stem(String s) { 506 | if (stem(s.toCharArray(), s.length())) return toString(); 507 | else return s; 508 | } 509 | 510 | /** 511 | * Stem a word contained in a char[]. Returns true if the stemming process 512 | * resulted in a word different from the input. You can retrieve the result 513 | * with getResultLength()/getResultBuffer() or toString(). 514 | */ 515 | public boolean stem(char[] word) { 516 | return stem(word, word.length); 517 | } 518 | 519 | /** 520 | * Stem a word contained in a portion of a char[] array. Returns true if the 521 | * stemming process resulted in a word different from the input. You can 522 | * retrieve the result with getResultLength()/getResultBuffer() or 523 | * toString(). 524 | */ 525 | public boolean stem(char[] wordBuffer, int offset, int wordLen) { 526 | reset(); 527 | if (b.length < wordLen) { 528 | b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)]; 529 | } 530 | System.arraycopy(wordBuffer, offset, b, 0, wordLen); 531 | i = wordLen; 532 | return stem(0); 533 | } 534 | 535 | /** 536 | * Stem a word contained in a leading portion of a char[] array. Returns 537 | * true if the stemming process resulted in a word different from the input. 538 | * You can retrieve the result with getResultLength()/getResultBuffer() or 539 | * toString(). 540 | */ 541 | public boolean stem(char[] word, int wordLen) { 542 | return stem(word, 0, wordLen); 543 | } 544 | 545 | /** 546 | * Stem the word placed into the Stemmer buffer through calls to add(). 547 | * Returns true if the stemming process resulted in a word different from 548 | * the input. You can retrieve the result with 549 | * getResultLength()/getResultBuffer() or toString(). 550 | */ 551 | public boolean stem() { 552 | return stem(0); 553 | } 554 | 555 | public boolean stem(int i0) { 556 | k = i - 1; 557 | k0 = i0; 558 | if (k > k0 + 1) { 559 | step1(); 560 | step2(); 561 | step3(); 562 | step4(); 563 | step5(); 564 | step6(); 565 | } 566 | // Also, a word is considered dirty if we lopped off letters 567 | // Thanks to Ifigenia Vairelles for pointing this out. 568 | if (i != k + 1) dirty = true; 569 | i = k + 1; 570 | return dirty; 571 | } 572 | 573 | } 574 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene/SegmentWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

 * hankcs
 * me@hankcs.com
 * 2015/10/6 18:51
 *
 * Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/
 */
package com.hankcs.lucene;

import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Wraps a segmenter so that it emits one token at a time.
 *
 * @author hankcs
 */
public class SegmentWrapper {
    /**
     * buffer size
     */
    private static final int BUFFER_SIZE = 512;
    /**
     * sentence delimiters
     */
    private static final Set<Character> delimiterCharSet = new HashSet<Character>() {{
        add('\r');
        add('\n');
        add(';');
        add(';');
        add('。');
        add('!');
        add('!');
    }};
    /**
     * offset of the current term; since the wrapper reads the input line by line,
     * term.offset has to be corrected by this amount
     */
    int offset;
    /**
     * input
     */
    private Reader input;
    /**
     * segmenter
     */
    private Segment segment;
    /**
     * segmentation result
     */
    private Iterator<Term> iterator;
    /**
     * buffer
     */
    private char[] buffer = new char[BUFFER_SIZE];
    /**
     * length of the unprocessed tail of the buffer
     */
    private int remainSize = 0;

    public SegmentWrapper(Reader reader, Segment segment) {
        this.input = reader;
        this.segment = segment;
    }

    /**
     * Resets the segmenter.
     *
     * @param reader the new input
     */
    public void reset(Reader reader) {
        input = reader;
        offset = 0;
        iterator = null;
    }

    public Term next() throws IOException {
        if (iterator != null && iterator.hasNext()) return iterator.next();
        String line = readLine();
        if (line == null) return null;
        List<Term> termList = segment.seg(line);
        if (termList.size() == 0) return null;
        for (Term term : termList) {
            term.offset += offset;
        }
        offset += line.length();
        iterator = termList.iterator();
        return iterator.next();
    }

    private String readLine() throws IOException {
        int offset = 0;
        int length = BUFFER_SIZE;
        if (remainSize > 0) {
            offset = remainSize;
            length -= remainSize;
        }
        int n = input.read(buffer, offset, length);
        if (n < 0) {
            if (remainSize != 0) {
                String lastLine = new String(buffer, 0, remainSize);
                remainSize = 0;
                return lastLine;
            }
            return null;
        }
        n += offset;

        int eos = lastIndexOfEos(buffer, n);
        String line = new String(buffer, 0, eos);
        remainSize = n - eos;
        System.arraycopy(buffer, eos, buffer, 0, remainSize);
        return line;
    }

    private int lastIndexOfEos(char[] buffer, int length) {
        for (int i = length - 1; i > 0; i--) {
            if (delimiterCharSet.contains(buffer[i])) {
                return i + 1;
            }
        }
        return length;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/CloudApplication.java:
--------------------------------------------------------------------------------
package com.watt;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class CloudApplication {
    public static void main(String[] args) {
        SpringApplication.run(CloudApplication.class, args);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/DbConfig.java:
--------------------------------------------------------------------------------
package com.watt.configure;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

@Configuration
@ConfigurationProperties(prefix = "db.mysql")
public class DbConfig {
    private String driverClass;
    private String jdbcUrl;
    private String user;
    private String password;

    public String getDriverClass() {
        return driverClass;
    }

    public void setDriverClass(String driverClass) {
        this.driverClass = driverClass;
    }

    public String getJdbcUrl() {
        return jdbcUrl;
    }

    public void setJdbcUrl(String jdbcUrl) {
        this.jdbcUrl = jdbcUrl;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/LuceneConfig.java:
--------------------------------------------------------------------------------
package com.watt.configure;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

import java.io.IOException;
import java.nio.file.FileSystems;

@Configuration
@ConfigurationProperties(prefix = "lucene")
public class LuceneConfig {
    private static Directory directory;
    private String root;
    private static DirectoryReader reader;
    private final Logger logger = LoggerFactory.getLogger(this.getClass());
    private String indexKey;
    private String vectorPath;
    private String tfidfPath;

    private Directory getDirectory() {
        try {
            directory = FSDirectory.open(FileSystems.getDefault().getPath(root));
        } catch (IOException e) {
            logger.error("failed to open the Lucene directory");
            return null;
        }
        return directory;
    }

    public IndexSearcher getIndexSearcher() {
        Directory directory = getDirectory();
        if (directory == null) {
            return null;
        }
        try {
            reader = DirectoryReader.open(directory);
            DirectoryReader tr = DirectoryReader.openIfChanged(reader);
            if (tr != null) {
                reader.close();
                reader = tr;
            }

            return new IndexSearcher(reader);
        } catch (IOException e) {
            logger.error("failed to open the IndexReader; cannot continue");
        }
        return null;
    }

    public String getRoot() {
        return root;
    }

    public void setRoot(String root) {
        this.root = root;
    }

    public String getIndexKey() {
        return indexKey;
    }

    public void setIndexKey(String indexKey) {
        this.indexKey = indexKey;
    }

    public String getVectorPath() {
        return vectorPath;
    }

    public void setVectorPath(String vectorPath) {
        this.vectorPath = vectorPath;
    }

    public String getTfidfPath() {
        return tfidfPath;
    }

    public void setTfidfPath(String tfidfPath) {
        this.tfidfPath = tfidfPath;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/MybatisConfig.java:
--------------------------------------------------------------------------------
package com.watt.configure;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

@Configuration
@ConfigurationProperties(prefix = "db.mybatis")
public class MybatisConfig {
    private String mybatisXml;

    public String getMybatisXml() {
        return mybatisXml;
    }

    public void setMybatisXml(String mybatisXml) {
        this.mybatisXml = mybatisXml;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/QuestionsIndex.java:
--------------------------------------------------------------------------------
package com.watt.core;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.synonym.Synonym;
import com.hankcs.hanlp.dictionary.CoreSynonymDictionary;
import com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.lucene.HanLPIndexAnalyzer;
import com.watt.configure.LuceneConfig;
import com.watt.mvc.service.QAService;
import com.watt.util.FileUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.nio.file.FileSystems;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Creation and storage of the Lucene index.
 */
@Component
public class QuestionsIndex {
    private final Logger logger = LoggerFactory.getLogger(this.getClass());
    private final QAService qaService;
    private final LuceneConfig luceneConfig;
    private Segment segment = HanLP.newSegment();

    @Autowired
    public QuestionsIndex(QAService qaService, LuceneConfig luceneConfig) {
        this.qaService = qaService;
        this.luceneConfig = luceneConfig;
    }

    /**
     * Initializes (rebuilds) the index from the question database.
     */
    public void createIndex() {

        FileUtils.clearPath(luceneConfig.getRoot());
        IndexWriter writer;
        Directory directory;
        IndexWriterConfig iwc = new IndexWriterConfig(new HanLPIndexAnalyzer());
        // open the index directory
        try {
            directory = FSDirectory.open(FileSystems.getDefault().getPath(luceneConfig.getRoot()));
            writer = new IndexWriter(directory, iwc);
        } catch (IOException e) {
            logger.info("failed to open the Lucene directory");
            return;
        }

        int start = 0;
        int pageSize = 1000;
        List<Map<String, String>> words;
        AtomicInteger count = new AtomicInteger();
        do {
            words = qaService.queryQuestions(start, pageSize);
            for (Map<String, String> word : words) {
                if (word != null && !word.isEmpty()) {
                    Document doc = new Document(); // create a Document and add Fields (a Field is a child element of a Document)
                    doc.add(new Field("questions", word.get("QUESTION"), TextField.TYPE_STORED));
                    doc.add(new Field(luceneConfig.getIndexKey(), addSynonymItems(word.get("QUESTION")), TextField.TYPE_STORED));
                    doc.add(new Field("key", word.get("KW_ID"), TextField.TYPE_STORED));
                    doc.add(new Field("questionID", word.get("QUESTION_ID"), TextField.TYPE_STORED));
                    try {
                        writer.addDocument(doc);
                    } catch (IOException e) {
                        logger.info("failed to index QUESTION:" + word.get("QUESTION") + "\nID:" + word.get("KW_ID"));
                    }
                    count.getAndIncrement();
                }
            }
            logger.info("questions indexed so far:" + count.doubleValue());
            start += pageSize;
        } while (!words.isEmpty());
        try {
            writer.close();
            directory.close();
        } catch (IOException e) {
            logger.info("failed to close the Lucene directory and writer");
        }

    }
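    /**
     * Appends the EQUAL-type synonyms of every term to the question text, so a
     * query phrased with a synonym can still match the indexed question. For
     * example (hypothetical words): if "发票" has the equal synonym "票据", the
     * indexed text contains both, and a search for "票据" hits the question.
     */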
    private String addSynonymItems(String word) {
        List<Term> termList = segment.seg(word);
        StringBuffer result = new StringBuffer();
        for (Term term : termList) {
            result.append(term.word);
            CommonSynonymDictionary.SynonymItem item = CoreSynonymDictionary.get(term.word);
            if (item != null && item.type == Synonym.Type.EQUAL && item.synonymList != null && !item.synonymList.isEmpty()) {
                for (Synonym synonym : item.synonymList) {
                    result.append(synonym.realWord);
                }
            }
        }
        return result.toString();
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/dictionary/CoreAbbreviationDictionary.java:
--------------------------------------------------------------------------------
package com.watt.core.dictionary;

import com.hankcs.hanlp.seg.common.Term;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Core abbreviation dictionary: maps abbreviations to their full names.
 */
public class CoreAbbreviationDictionary {
    private static Map<String, String> abbreviation = new HashMap<>();

    public static String getAbbreviation(String abbr) {
        return abbreviation.get(abbr);
    }

    public static void addAbbreviation(String abbr, String full) {
        abbreviation.put(abbr, full);
    }

    /**
     * Replaces every abbreviation with its full name.
     */
    public static List<Term> convertAbbreviationToFull(List<Term> terms) {
        terms.forEach(term -> {
            String full = getAbbreviation(term.word);
            if (full != null) {
                term.word = full;
            }
        });
        return terms;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/watt/core/dictionary/CoreStopWordsDictionary.java:
--------------------------------------------------------------------------------
package com.watt.core.dictionary;

import com.hankcs.hanlp.seg.common.Term;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class CoreStopWordsDictionary {
    private static Set<String> stopWords = new HashSet<>();

    public static void addStopWord(String word) {
        stopWords.add(word);
    }

    private static boolean contains(String word) {
        return stopWords.contains(word);
    }

    /**
     * Removes all stop words.
     *
     * @return the original terms, minus the stop words
     */
    public static List<Term> removeStopWords(List<Term> terms) {
        List<Term> result = new ArrayList<>();
        terms.forEach(term -> {
            if (!contains(term.word)) {
                result.add(term);
            }
        });
        return result;
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/dictionary/MyCustomDictionary.java:
--------------------------------------------------------------------------------
package com.watt.core.dictionary;

import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.watt.mvc.service.QAService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.List;
import java.util.Map;

/**
 * Loading and maintenance of the custom dictionaries.
 */
@Component
public class MyCustomDictionary {
    private final Logger logger = LoggerFactory.getLogger(this.getClass());
    private final QAService qaService;

    @Autowired
    public MyCustomDictionary(QAService qaService) {
        this.qaService = qaService;
    }

    public void initDictionary() {
        List<Map<String, String>> words = qaService.queryDictionaryAll();
        words.forEach(word -> {
            if (word != null && !word.isEmpty()) {
                CustomDictionary.add(word.get("WORD"));
            }
        });
        logger.info("custom dictionary entries loaded:" + words.size());
    }

    public void initCiLinSynonyms() {
        List<String> synonyms = qaService.querySynonyms();
        //CoreSynonymDictionary.reload(synonyms);
        logger.info("synonym entries loaded:" + synonyms.size());
    }

    public void initStopWords() {
        List<String> stopWords = qaService.queryStopWordsAll();
        stopWords.forEach(CoreStopWordsDictionary::addStopWord);
        logger.info("stop words loaded:" + stopWords.size());
    }

    /**
     * Initializes the abbreviation dictionary and registers every abbreviation
     * as a custom word.
     */
    public void initAbbreviation() {
        List<Map<String, String>> abbreviations = qaService.queryAbbreviation(null);
        abbreviations.forEach(abbreviation -> {
            CustomDictionary.add(abbreviation.get("abbr_name"));
            CoreAbbreviationDictionary.addAbbreviation(abbreviation.get("abbr_name"), abbreviation.get("full_name"));
        });
        logger.info("abbreviation entries loaded:" + abbreviations.size());
    }

    /**
     * Adds a tax-domain term to the dictionary; the addition is persisted.
     *
     * @param word the term (or corpus entry) to add
     */
    public void addTaxDictionaryWord(String word) {
        qaService.addTaxDictionaryWord(word);
        CustomDictionary.add(word);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/nlp/cosinesimlarity/AtomSegment.java:
--------------------------------------------------------------------------------
package com.watt.core.nlp.cosinesimlarity;

import java.util.HashMap;
import java.util.Map;

public class AtomSegment {
    public AtomSegment() {
    }

    public static String atomSegment(String sentence) {
        String atomSegResult = "";
        Map<Integer, String> wsWordMap = IDExtract.getLetters(sentence);
        Map<Integer, String> mWordMap = IDExtract.getNumbers(sentence);
        Map<Integer, String> wordsMap = new HashMap<>();
        wordsMap.putAll(wsWordMap);
        wordsMap.putAll(mWordMap);
        int senLength = sentence.length();

        for (int i = 0; i < senLength; ++i) {
            String word_i = wordsMap.get(i);
            if (word_i == null) {
                word_i = sentence.charAt(i) + "";
                wordsMap.put(i, word_i);
            } else {
                i += word_i.length() - 1;
            }

            atomSegResult = atomSegResult + " " + word_i;
        }

        return atomSegResult;
    }
}
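// Illustrative example: atomSegment("ID123测试") returns " ID 123 测 试".
// Runs of letters and runs of digits are kept whole, every other character
// becomes its own token, and each token is preceded by a single space.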
-------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/IDExtract.java: -------------------------------------------------------------------------------- 1 | // 2 | // Source code recreated from a .class file by IntelliJ IDEA 3 | // (powered by Fernflower decompiler) 4 | // 5 | 6 | package com.watt.core.nlp.cosinesimlarity; 7 | 8 | import java.util.ArrayList; 9 | import java.util.Map; 10 | import java.util.TreeMap; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | public class IDExtract { 15 | public IDExtract() { 16 | } 17 | 18 | private static Map getStr(String regex_code, String param) { 19 | if (regex_code != null && !"".equals(regex_code)) { 20 | if (param != null && !"".equals(param)) { 21 | new ArrayList(); 22 | Map map = new TreeMap(); 23 | Pattern p = Pattern.compile(regex_code); 24 | Matcher m = p.matcher(param); 25 | 26 | while(m.find()) { 27 | map.put(m.start(), m.group()); 28 | } 29 | 30 | return map; 31 | } else { 32 | return null; 33 | } 34 | } else { 35 | return null; 36 | } 37 | } 38 | 39 | public static Map getLetters(String param) { 40 | String regex_code = "[A-Za-z]+"; 41 | return getStr(regex_code, param); 42 | } 43 | 44 | public static Map getNumbers(String param) { 45 | String regex_code = "(\\+|\\-)?\\d+(\\.\\d+)?"; 46 | return getStr(regex_code, param); 47 | } 48 | 49 | public static Map getEmail(String param) { 50 | String regex_code = "([a-zA-Z_]{1,}[0-9]{0,}@(([a-zA-z0-9]-*){1,}\\.){1,3}[a-zA-z\\-]{1,})|([1-9]\\d{4,10}@qq.com)"; 51 | return getStr(regex_code, param); 52 | } 53 | 54 | public static Map getMobile(String param) { 55 | String regex_code = "(? getTelNumber(String param) { 60 | String regex_code = "((\\(0[1-9][0-9]{1,2}\\))|((? getIDCard(String param) { 65 | String regex_code = "(\\d{6})(18|19|20)?((\\d{2})|(\\*\\*)|(XX)|(xx)|(\\?\\?)|(\\?\\?))(([01]\\d)|(\\*\\*)|(XX)|(xx)|(\\?\\?)|(\\?\\?))(([0123]\\d)|(\\*\\*)|(XX)|(xx)|(\\?\\?)|(\\?\\?))(((\\d{3})(\\d|X|x)?)|(\\*\\*\\*\\*)|(XXXX)|(xxxx)|(\\?\\?\\?\\?)|(\\?\\?\\?\\?))"; 66 | return getStr(regex_code, param); 67 | } 68 | 69 | public static Map getIPAddr(String param) { 70 | String regex_code = "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"; 71 | return getStr(regex_code, param); 72 | } 73 | 74 | public static Map getQQ(String param) { 75 | String regex_code = "[1-9]\\d{4,10}"; 76 | return getStr(regex_code, param); 77 | } 78 | 79 | public static Map getTime(String param) { 80 | String regex_code1 = "((\\d{4}-\\d{1,2}-\\d{1,2})|(\\d{2,4}\\u5E74\\d{1,2}\\u6708\\d{1,2}\\u65E5))"; 81 | String regex_code2 = "((\\d{2}:\\d{2}:\\d{2})|(((\\d{1,2}\\u65F6)|(\\d{1,2}\\u70b9))(\\d{1,2}\\u5206(\\d{1,2}\\u79D2)?)?))"; 82 | String regex_code3 = "((\\d{4}-\\d{1,2}-\\d{1,2})|(\\d{2,4}\\u5E74\\d{1,2}\\u6708\\d{1,2}\\u65E5)|(\\d{2,4}\\u5E74\\d{1,2}\\u6708)|\\d{1,2}\\u6708\\d{1,2}\\u65E5|\\d{2,4}\\u5E74|\\d{1,2}\\u6708|\\d{1,2}\\u65E5)"; 83 | String regex_code4 = "(\\d{2}:\\d{2}:\\d{2})|(\\d{2}:\\d{2})|(((\\d{1,2}\\u65F6)|(\\d{1,2}\\u70b9))\\d{1,2}\\u5206(\\d{1,2}\\u79D2)?)|(\\d{1,2}\\u65F6)"; 84 | String regex_code = regex_code1 + "(\\s)?" 
+ regex_code2 + "|" + regex_code3 + "|" + regex_code4; 85 | return getStr(regex_code, param); 86 | } 87 | 88 | public static Map getCNTime(String param) { 89 | String regex_code1 = "(([零○一二两三四五六七八九十百千万亿]{2,}\\u5E74([一二三四五六七八九十]|(十(一|二)))\\u6708([一二三四五六七八九十]{1,})\\u65E5)((\\s)?((([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9))[零○一二两三四五六七八九十]{1,}\\u5206[零○一二两三四五六七八九十]{1,}\\u79D2))*)"; 90 | String regex_code2 = "(([零○一二两三四五六七八九十百千万亿]{2,}\\u5E74([一二三四五六七八九十]|(十(一|二)))\\u6708([一二三四五六七八九十]{1,})\\u65E5)*((\\s)?((([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9))[零○一二两三四五六七八九十]{1,}\\u5206[零○一二两三四五六七八九十]{1,}\\u79D2)))"; 91 | String regex_code3 = "((([零○一二两三四五六七八九十百千万亿]{2,}\\u5E74([一二三四五六七八九十]|(十(一|二)))\\u6708)|(([一二三四五六七八九十]|(十(一|二)))\\u6708([一二三四五六七八九十]{1,})\\u65E5)|([(零|○)一(二|两)三四五六七八九十百千万亿]{2,}\\u5E74)|(([一二三四五六七八九十]|(十(一|二)))\\u6708)|(([一二三四五六七八九十]{1,})\\u65E5)))"; 92 | String regex_code4 = "((([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9))[零○一二两三四五六七八九十]{1,}\\u5206|(([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9)))"; 93 | String regex_code = regex_code1 + "|" + regex_code2 + "|" + regex_code3 + "|" + regex_code4; 94 | return getStr(regex_code, param); 95 | } 96 | 97 | public static Map getURL(String param) { 98 | String regex_code = "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"; 99 | return getStr(regex_code, param); 100 | } 101 | 102 | public static Map getCarNum(String param) { 103 | String regex_code = "[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z]{1}[A-Z]{1}[A-Z0-9]{4}[A-Z0-9挂学警港澳]{1}"; 104 | return getStr(regex_code, param); 105 | } 106 | 107 | public static Map getBankCard(String param) { 108 | String regex_code = "(\\d{16}|\\d{19})"; 109 | return getStr(regex_code, param); 110 | } 111 | 112 | public static Map getCNNum(String param) { 113 | String regex_code = "(第[零○一二两三四五六七八九十廿卅百千万亿]{1,})|((几|数)(十|百|千|万|(十万)|(百万)|(千万)|亿))|((成|上)(百|千|万|(十万)|(百万)|(千万)|亿))|([零○一二两三四五六七八九十廿卅百千万亿]{2,})|([零壹贰叁肆伍陆柒捌玖拾佰仟万亿]{2,})"; 114 | return getStr(regex_code, param); 115 | } 116 | 117 | public static Map getPerNum(String param) { 118 | String regex_code1 = "((\\d{1,})(\\.\\d{1,})?%|(百分之(([○零一二两三四五六七八九十廿卅百]{1,})(点[○零一二两三四五六七八九十廿卅]{1,})?)|((\\d{1,})(\\.\\d{1,})?)))"; 119 | String regex_code2 = "((\\d{1,})(\\.\\d{1,})?‰|(千分之(([○零一二两三四五六七八九十廿卅百千]{1,})(点[○零一二两三四五六七八九十廿卅]{1,})?))|((\\d{1,})(\\.\\d{1,})?))"; 120 | String regex_code3 = "((([○零一二两三四五六七八九十廿卅百千万亿]{1,})|(\\d{1,}))分之(([○零一二两三四五六七八九十廿卅百千万亿]{1,})|(\\d{1,})))"; 121 | String regex_code = regex_code3 + "|" + regex_code1 + "|" + regex_code2; 122 | return getStr(regex_code, param); 123 | } 124 | 125 | public static Map getFloatNum(String param) { 126 | String regex_code = "([1-9]\\d*\\.\\d*|0\\.\\d*[1-9]\\d*)"; 127 | return getStr(regex_code, param); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/SimilarityAnalyze.java: -------------------------------------------------------------------------------- 1 | // 2 | // Source code recreated from a .class file by IntelliJ IDEA 3 | // (powered by Fernflower decompiler) 4 | // 5 | 6 | package com.watt.core.nlp.cosinesimlarity; 7 | 8 | import com.hankcs.hanlp.seg.common.Term; 9 | 10 | import java.io.IOException; 11 | import java.util.List; 12 | 13 | public abstract class SimilarityAnalyze { 14 | Word2Vec vec = new Word2Vec(); 15 | boolean loadModel; 16 | 17 | public Word2Vec getVec() { 18 | return vec; 19 | } 20 | 21 | public void setVec(Word2Vec vec) { 22 | 
this.vec = vec;
23 | }
24 | 
25 | public void loadGoogleModel(String filePath) {
26 | try {
27 | this.vec.loadGoogleModel(filePath);
28 | } catch (IOException e) {
29 | e.printStackTrace();
30 | }
31 | this.loadModel = true;
32 | }
33 | 
34 | public void loadCommonModel(String filePath) {
35 | try {
36 | this.vec.loadCommonModel(filePath);
37 | } catch (IOException e) {
38 | e.printStackTrace();
39 | }
40 | this.loadModel = true;
41 | }
42 | 
43 | public void loadJavaModel(String filePath) {
44 | try {
45 | this.vec.loadJavaModel(filePath);
46 | } catch (IOException e) {
47 | e.printStackTrace();
48 | }
49 | this.loadModel = true;
50 | }
51 | 
52 | float[] getWordVector(String word) {
53 | return !this.loadModel ? null : this.vec.getWordVector(word);
54 | }
55 | 
56 | /**
57 |  * Computes the cosine similarity between two vectors.
58 |  *
59 |  * @param vec1 first vector
60 |  * @param vec2 second vector
61 |  * @return similarity score in [0, 1]
62 |  */
63 | double calCosine(float[] vec1, float[] vec2) {
64 | double dist = 0.0;
65 | double sum1 = 0.0;
66 | double sum2 = 0.0;
67 | if (vec1.length != vec2.length) {
68 | return dist;
69 | }
70 | for (int i = 0; i < vec1.length; ++i) {
71 | dist += vec1[i] * vec2[i];
72 | sum1 += Math.pow(vec1[i], 2);
73 | sum2 += Math.pow(vec2[i], 2);
74 | }
75 | double result = dist / Math.sqrt(sum1 * sum2);
76 | // Floating-point rounding can push the score of (near-)identical vectors
77 | // slightly above 1; clamp it so downstream scores stay within 100%.
78 | return result > 1.0 ? 1.0 : result;
79 | }
80 | 
81 | double calMaxSimilarity(String centerWord, List<Term> wordList) {
82 | double max = -1.0;
83 | for (Term term : wordList) {
84 | if (term.word.equals(centerWord)) {
85 | return 1.0;
86 | }
87 | }
88 | for (Term term : wordList) {
89 | double temp = this.wordSimilarity(centerWord, term.word);
90 | if (temp != 0.0 && temp > max) {
91 | max = temp;
92 | }
93 | }
94 | return max == -1.0 ? 0.0 : max;
95 | }
96 | 
97 | public abstract double wordSimilarity(String word1, String word2);
98 | 
99 | public abstract double sentenceSimilarity(List<Term> sentence1Words, List<Term> sentence2Words);
100 | 
101 | }
102 | 
-------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/SimilarityAnalyzeUnfamiliarWords.java: --------------------------------------------------------------------------------
1 | package com.watt.core.nlp.cosinesimlarity;
2 | 
3 | import com.hankcs.hanlp.seg.common.Term;
4 | 
5 | import java.io.IOException;
6 | import java.util.List;
7 | 
8 | /**
9 |  * Similarity analyzer that falls back to character vectors for words missing
10 |  * from the word-level model.
11 |  */
12 | public class SimilarityAnalyzeUnfamiliarWords extends SimilarityAnalyze {
13 | private Word2Vec charVec;
14 | private int dimension = 200;
15 | 
16 | public SimilarityAnalyzeUnfamiliarWords() {
17 | this.vec = new Word2Vec();
18 | this.charVec = new Word2Vec();
19 | this.loadModel = false;
20 | }
21 | 
22 | public void loadCharJavaModel(String modelPath) {
23 | try {
24 | this.charVec.loadCommonModel(modelPath);
25 | } catch (IOException e) {
26 | e.printStackTrace();
27 | }
28 | this.loadModel = true;
29 | }
30 | 
31 | public double wordSimilarity(String word1, String word2) {
32 | if (!this.loadModel) {
33 | return 0.0;
34 | }
35 | float[] word1Vec = this.getWordVector(word1);
36 | float[] word2Vec = this.getWordVector(word2);
37 | // fall back to averaged character vectors for out-of-vocabulary words
38 | if (word1Vec == null) {
39 | word1Vec = this.getNullVec(word1);
40 | }
41 | if (word2Vec == null) {
42 | word2Vec = this.getNullVec(word2);
43 | }
44 | return this.calCosine(word1Vec, word2Vec);
45 | }
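46 | // Worked example (illustrative note, not part of the original source): for
47 | // vec1 = (1, 0) and vec2 = (1, 1), calCosine returns
48 | // (1*1 + 0*1) / sqrt((1 + 0) * (1 + 1)) = 1 / sqrt(2) ≈ 0.707, so words
49 | // sharing about half of their vector direction score roughly 0.7, while an
50 | // exact in-vocabulary match short-circuits to 1.0 in calMaxSimilarity.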
51 | 
52 | private float[] getCharVector(String word) {
53 | return !this.loadModel ? null : this.charVec.getWordVector(word);
54 | }
55 | 
56 | /**
57 |  * Builds a stand-in vector for an out-of-vocabulary word by averaging the
58 |  * character vectors of its atoms (see AtomSegment).
59 |  */
60 | private float[] getNullVec(String word) {
61 | float[] nullVec = new float[this.dimension];
62 | int count = 0;
63 | String atomSegment = AtomSegment.atomSegment(word).trim();
64 | String[] atomSegmentStr = atomSegment.split("\\s+");
65 | for (int i = 0; i < atomSegmentStr.length; ++i) {
66 | float[] charVector = this.getCharVector(atomSegmentStr[i]);
67 | if (charVector != null) {
68 | ++count;
69 | for (int j = 0; j < this.dimension; ++j) {
70 | nullVec[j] += charVector[j];
71 | }
72 | }
73 | }
74 | // with zero or one known atoms the sum is already the answer; otherwise average
75 | if (count > 1) {
76 | for (int i = 0; i < this.dimension; ++i) {
77 | nullVec[i] /= (float) count;
78 | }
79 | }
80 | return nullVec;
81 | }
82 | 
83 | /**
84 |  * Sentence similarity.
85 |  *
86 |  * @param sentence1Words segmented text one
87 |  * @param sentence2Words segmented text two
88 |  * @return similarity score (a probability between 0 and 1)
89 |  */
90 | public double sentenceSimilarity(List<Term> sentence1Words, List<Term> sentence2Words) {
91 | if (!this.loadModel) {
92 | return 0.0;
93 | } else if (!sentence1Words.isEmpty() && !sentence2Words.isEmpty()) {
94 | float sum1 = 0.0F;
95 | float sum2 = 0.0F;
96 | int count1 = 0;
97 | int count2 = 0;
98 | // for every word of sentence 1, score its best match within sentence 2
99 | for (Term sentence1Word : sentence1Words) {
100 | ++count1;
101 | sum1 += this.calMaxSimilarity(sentence1Word.word, sentence2Words);
102 | }
103 | // and vice versa for every word of sentence 2
104 | for (Term sentence2Word : sentence2Words) {
105 | ++count2;
106 | sum2 += this.calMaxSimilarity(sentence2Word.word, sentence1Words);
107 | }
108 | // guard against zero counts so the divisions cannot produce NaN
109 | if (count1 == 0) {
110 | return count2 == 0 ? 0.0 : sum2 / count2;
111 | } else if (count2 == 0) {
112 | return sum1 / count1;
113 | }
114 | // take the smaller of the two directed scores; this keeps a short text that
115 | // is merely contained in a longer one from scoring as a full match
116 | return Math.min(sum1 / count1, sum2 / count2);
117 | } else {
118 | return 0.0;
119 | }
120 | }
121 | 
122 | }
123 | 
-------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/Word2Vec.java: --------------------------------------------------------------------------------
1 | package com.watt.core.nlp.cosinesimlarity;
2 | 
3 | import java.io.*;
4 | import java.nio.charset.StandardCharsets;
5 | import java.util.HashMap;
6 | 
7 | public class Word2Vec {
8 | private static final int MAX_SIZE = 50;
9 | private HashMap<String, float[]> wordMap = new HashMap<>();
10 | private int words;
11 | private int size;
12 | 
13 | private static float readFloat(InputStream is) throws IOException {
14 | byte[] bytes = new byte[4];
15 | is.read(bytes);
16 | return getFloat(bytes);
17 | }
18 | 
19 | /**
20 |  * Decodes four little-endian bytes into one float.
21 |  */
22 | private static float getFloat(byte[] b) {
23 | int accum = 0;
24 | accum |= (b[0] & 0xff) << 0;
25 | accum |= (b[1] & 0xff) << 8;
26 | accum |= (b[2] & 0xff) << 16;
27 | accum |= (b[3] & 0xff) << 24;
28 | return Float.intBitsToFloat(accum);
29 | }
30 | 
31 | /**
32 |  * Reads one whitespace-terminated token from the stream.
33 |  */
34 | private static String readString(DataInputStream dis) throws IOException {
35 | byte[] bytes = new byte[MAX_SIZE];
36 | byte b = dis.readByte();
37 | int i = -1;
38 | StringBuilder sb = new StringBuilder();
39 | // 32 = ' ' and 10 = '\n' delimit tokens in the word2vec binary format
40 | while (b != 32 && b != 10) {
41 | i++;
42 | bytes[i] = b;
43 | b = dis.readByte();
44 | if (i == 49) {
45 | sb.append(new String(bytes, StandardCharsets.UTF_8));
46 | i = -1;
47 | bytes = new byte[MAX_SIZE];
48 | }
49 | }
50 | String s = new String(bytes, 0, i + 1, StandardCharsets.UTF_8);
51 | sb.append(s);
52 | return 
sb.toString(); 53 | } 54 | 55 | /** 56 | * 加载模型 57 | * 58 | * @param path 模型的路径 59 | */ 60 | void loadGoogleModel(String path) throws IOException { 61 | DataInputStream dis = null; 62 | BufferedInputStream bis = null; 63 | double len = 0; 64 | float vector = 0; 65 | bis = new BufferedInputStream(new FileInputStream(path)); 66 | dis = new DataInputStream(bis); 67 | // //读取词数 68 | words = Integer.parseInt(readString(dis)); 69 | // //大小 70 | size = Integer.parseInt(readString(dis)); 71 | String word; 72 | float[] vectors = null; 73 | for (int i = 0; i < words; i++) { 74 | word = readString(dis); 75 | vectors = new float[size]; 76 | len = 0; 77 | for (int j = 0; j < size; j++) { 78 | vector = readFloat(dis); 79 | len += vector * vector; 80 | vectors[j] = (float) vector; 81 | } 82 | len = Math.sqrt(len); 83 | 84 | for (int j = 0; j < size; j++) { 85 | vectors[j] /= len; 86 | } 87 | 88 | wordMap.put(word, vectors); 89 | dis.read(); 90 | } 91 | bis.close(); 92 | dis.close(); 93 | } 94 | 95 | /** 96 | * 加载模型 97 | * 98 | * @param path 模型的路径 99 | * @throws IOException 文件找不到时会抛出异常 100 | */ 101 | void loadCommonModel(String path) throws IOException { 102 | BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(path)))); 103 | String wordLine = null; 104 | //将第一行的文本省略掉,第一行分别是词数量和纬度,是不需要记录到加载内容中的 105 | reader.readLine(); 106 | while ((wordLine = reader.readLine()) != null) { 107 | String[] split = wordLine.trim().split("\\s+"); 108 | String key = ""; 109 | float[] value = new float[split.length - 1]; 110 | for (int i = 0; i < split.length; i++) { 111 | if (i == 0) { 112 | key = split[0]; 113 | } else { 114 | value[i - 1] = Float.parseFloat(split[i]); 115 | } 116 | } 117 | wordMap.put(key, value); 118 | } 119 | } 120 | 121 | void loadJavaModel(String path) throws IOException { 122 | try (DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path)))) { 123 | words = dis.readInt(); 124 | size = dis.readInt(); 125 | 126 | float vector = 0; 127 | 128 | String key = null; 129 | float[] value = null; 130 | for (int i = 0; i < words; i++) { 131 | double len = 0; 132 | key = dis.readUTF(); 133 | value = new float[size]; 134 | for (int j = 0; j < size; j++) { 135 | vector = dis.readFloat(); 136 | len += vector * vector; 137 | value[j] = vector; 138 | } 139 | 140 | len = Math.sqrt(len); 141 | 142 | for (int j = 0; j < size; j++) { 143 | value[j] /= len; 144 | } 145 | wordMap.put(key, value); 146 | } 147 | 148 | } 149 | } 150 | 151 | private float[] sum(float[] center, float[] fs) { 152 | 153 | if (center == null && fs == null) { 154 | return null; 155 | } 156 | 157 | if (fs == null) { 158 | return center; 159 | } 160 | 161 | if (center == null) { 162 | return fs; 163 | } 164 | 165 | for (int i = 0; i < fs.length; i++) { 166 | center[i] += fs[i]; 167 | } 168 | 169 | return center; 170 | } 171 | 172 | /** 173 | * 得到词向量 174 | */ 175 | public float[] getWordVector(String word) { 176 | return wordMap.get(word); 177 | } 178 | 179 | /** 180 | * 设置词向量 181 | */ 182 | public void setWordVector(String word, float[] value) { 183 | wordMap.put(word, value); 184 | } 185 | 186 | public HashMap getWordMap() { 187 | return wordMap; 188 | } 189 | 190 | public int getSize() { 191 | return size; 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /src/main/java/com/watt/data/jdbc/MySqlDataSource.java: -------------------------------------------------------------------------------- 1 | package 
com.watt.data.jdbc; 2 | 3 | import com.mchange.v2.c3p0.ComboPooledDataSource; 4 | import com.watt.configure.DbConfig; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.context.annotation.Bean; 9 | import org.springframework.context.annotation.Configuration; 10 | 11 | import java.beans.PropertyVetoException; 12 | 13 | @Configuration 14 | public class MySqlDataSource { 15 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 16 | private DbConfig dbConfig; 17 | 18 | @Autowired 19 | public MySqlDataSource(DbConfig dbConfig) { 20 | logger.info("MySqlDataSource OK"); 21 | this.dbConfig = dbConfig; 22 | } 23 | 24 | @Bean 25 | public ComboPooledDataSource getDataSource() { 26 | ComboPooledDataSource dataSource = new ComboPooledDataSource(); 27 | try { 28 | dataSource.setDriverClass(dbConfig.getDriverClass()); 29 | dataSource.setJdbcUrl(dbConfig.getJdbcUrl()); 30 | dataSource.setUser(dbConfig.getUser()); 31 | dataSource.setPassword(dbConfig.getPassword()); 32 | dataSource.setMinPoolSize(1); 33 | dataSource.setMaxPoolSize(2); 34 | dataSource.setInitialPoolSize(1); 35 | dataSource.setMaxIdleTime(180); 36 | dataSource.setAcquireRetryAttempts(30); 37 | return dataSource; 38 | } catch (PropertyVetoException e) { 39 | e.printStackTrace(); 40 | } 41 | return null; 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/main/java/com/watt/data/jdbc/MySqlSessionFactoryBean.java: -------------------------------------------------------------------------------- 1 | package com.watt.data.jdbc; 2 | 3 | import com.watt.configure.MybatisConfig; 4 | import org.mybatis.spring.SqlSessionFactoryBean; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.context.annotation.Bean; 9 | import org.springframework.context.annotation.Configuration; 10 | import org.springframework.core.io.ClassPathResource; 11 | 12 | @Configuration 13 | public class MySqlSessionFactoryBean { 14 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 15 | private MySqlDataSource dataSource; 16 | private MybatisConfig mybatisConfig; 17 | 18 | @Autowired 19 | public MySqlSessionFactoryBean(MySqlDataSource dataSource, MybatisConfig mybatisConfig) { 20 | logger.info("MySqlSessionFactoryBean OK"); 21 | this.dataSource = dataSource; 22 | this.mybatisConfig = mybatisConfig; 23 | } 24 | 25 | @Bean 26 | public SqlSessionFactoryBean getSqlSessionFactoryBean() { 27 | SqlSessionFactoryBean sqlSessionFactoryBean = new SqlSessionFactoryBean(); 28 | sqlSessionFactoryBean.setDataSource(dataSource.getDataSource()); 29 | sqlSessionFactoryBean.setConfigLocation(new ClassPathResource(mybatisConfig.getMybatisXml())); 30 | return sqlSessionFactoryBean; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/watt/data/jdbc/MySqlSessionTemplate.java: -------------------------------------------------------------------------------- 1 | package com.watt.data.jdbc; 2 | 3 | import org.mybatis.spring.SqlSessionTemplate; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.context.annotation.Bean; 8 | import org.springframework.context.annotation.Configuration; 9 | 10 | @Configuration 11 | 
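// Exposes one shared SqlSessionTemplate over the pooled data source; the static
// field below caches it so repeated bean lookups reuse the same instance.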
public class MySqlSessionTemplate { 12 | private static SqlSessionTemplate sqlSessionTemplate; 13 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 14 | 15 | private MySqlSessionFactoryBean mySqlSessionFactoryBean; 16 | 17 | @Autowired 18 | public MySqlSessionTemplate(MySqlSessionFactoryBean mySqlSessionFactoryBean) { 19 | logger.info("MySqlSessionTemplate OK"); 20 | this.mySqlSessionFactoryBean = mySqlSessionFactoryBean; 21 | } 22 | 23 | @Bean 24 | public SqlSessionTemplate getSqlSessionTemplate() { 25 | try { 26 | if (sqlSessionTemplate == null) { 27 | sqlSessionTemplate = new SqlSessionTemplate(mySqlSessionFactoryBean.getSqlSessionFactoryBean().getObject()); 28 | } 29 | return sqlSessionTemplate; 30 | } catch (Exception e) { 31 | e.printStackTrace(); 32 | } 33 | return null; 34 | } 35 | 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/beans/CheckResult.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.beans; 2 | 3 | public class CheckResult { 4 | private String code; 5 | private String message; 6 | private String content; 7 | 8 | public CheckResult(String code, String message, String content) { 9 | this.code = code; 10 | this.message = message; 11 | this.content = content; 12 | } 13 | 14 | public String getCode() { 15 | return code; 16 | } 17 | 18 | public void setCode(String code) { 19 | this.code = code; 20 | } 21 | 22 | public String getMessage() { 23 | return message; 24 | } 25 | 26 | public void setMessage(String message) { 27 | this.message = message; 28 | } 29 | 30 | public String getContent() { 31 | return content; 32 | } 33 | 34 | public void setContent(String content) { 35 | this.content = content; 36 | } 37 | 38 | @Override 39 | public String toString() { 40 | return "CheckResult{" + 41 | "code='" + code + '\'' + 42 | ", message='" + message + '\'' + 43 | ", content='" + content + '\'' + 44 | '}'; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/beans/PlatformResponse.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.beans; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.alibaba.fastjson.annotation.JSONField; 5 | 6 | 7 | public class PlatformResponse { 8 | @JSONField(name = "match") 9 | private String match; 10 | @JSONField(name = "question") 11 | private String question; 12 | @JSONField(name = "score") 13 | private double score; 14 | @JSONField(name = "list") 15 | private JSONArray list; 16 | @JSONField(name = "answer") 17 | private String answer; 18 | @JSONField(name = "answer_type") 19 | private String answer_type; 20 | @JSONField(name = "ref_id") 21 | private String ref_id; 22 | @JSONField(name = "key") 23 | private String key; 24 | @JSONField(name = "scene_end") 25 | private String scene_end; 26 | @JSONField(name = "user_id") 27 | private String user_id; 28 | 29 | 30 | @JSONField(name = "media_url") 31 | private String media_url; 32 | 33 | public PlatformResponse() { 34 | } 35 | 36 | public PlatformResponse(String match, String question, double score, JSONArray list, String answer, String key, String answer_type, String ref_id, String scene_end, String user_id,String media_url) { 37 | this.match = match; 38 | this.question = question; 39 | this.score = score; 40 | this.list = list; 41 | this.answer = answer; 42 | this.key = key; 43 | this.answer_type = 
answer_type; 44 | this.ref_id = ref_id; 45 | this.scene_end = scene_end; 46 | this.user_id = user_id; 47 | this.media_url = media_url; 48 | } 49 | 50 | public String getKey() { 51 | return key; 52 | } 53 | 54 | public void setKey(String key) { 55 | this.key = key; 56 | } 57 | 58 | public String getMatch() { 59 | return match; 60 | } 61 | 62 | public void setMatch(String match) { 63 | this.match = match; 64 | } 65 | 66 | public String getQuestion() { 67 | return question; 68 | } 69 | 70 | public void setQuestion(String question) { 71 | this.question = question; 72 | } 73 | 74 | public double getScore() { 75 | return score; 76 | } 77 | 78 | public void setScore(double score) { 79 | this.score = score; 80 | } 81 | 82 | public JSONArray getList() { 83 | return list; 84 | } 85 | 86 | public void setList(JSONArray list) { 87 | this.list = list; 88 | } 89 | 90 | public String getAnswer() { 91 | return answer; 92 | } 93 | 94 | public void setAnswer(String answer) { 95 | this.answer = answer; 96 | } 97 | 98 | public String getScene_end() { 99 | return scene_end; 100 | } 101 | 102 | public void setScene_end(String scene_end) { 103 | this.scene_end = scene_end; 104 | } 105 | 106 | public String getUser_id() { 107 | return user_id; 108 | } 109 | 110 | public void setUser_id(String user_id) { 111 | this.user_id = user_id; 112 | } 113 | 114 | 115 | public String getMedia_url() { 116 | return media_url; 117 | } 118 | 119 | public void setMedia_url(String media_url) { 120 | this.media_url = media_url; 121 | } 122 | 123 | @Override 124 | public String toString() { 125 | return "PlatformResponse{" + 126 | "match='" + match + '\'' + 127 | ", question='" + question + '\'' + 128 | ", score=" + score + 129 | ", list='" + list + '\'' + 130 | ", answer='" + answer + '\'' + 131 | ", key='" + key + '\'' + 132 | ", ref_id='" + ref_id + '\'' + 133 | ", scene_end='" + scene_end + '\'' + 134 | '}'; 135 | } 136 | 137 | public String getAnswer_type() { 138 | return answer_type; 139 | } 140 | 141 | public void setAnswer_type(String answer_type) { 142 | this.answer_type = answer_type; 143 | } 144 | 145 | public String getRef_id() { 146 | return ref_id; 147 | } 148 | 149 | public void setRef_id(String ref_id) { 150 | this.ref_id = ref_id; 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/beans/QAAnalyzeResult.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.beans; 2 | 3 | public class QAAnalyzeResult { 4 | private double score; 5 | private String key; 6 | private String match; 7 | 8 | public QAAnalyzeResult() { 9 | } 10 | 11 | public QAAnalyzeResult(double score, String key, String match) { 12 | this.score = score; 13 | this.key = key; 14 | this.match = match; 15 | } 16 | 17 | public double getScore() { 18 | return score; 19 | } 20 | 21 | public void setScore(double score) { 22 | this.score = score; 23 | } 24 | 25 | public String getKey() { 26 | return key; 27 | } 28 | 29 | public void setKey(String key) { 30 | this.key = key; 31 | } 32 | 33 | public String getMatch() { 34 | return match; 35 | } 36 | 37 | public void setMatch(String match) { 38 | this.match = match; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/controller/CorpusController.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.controller; 2 | 3 | import com.watt.util.FileUtils; 4 
| import com.watt.util.NLPUtils; 5 | import org.springframework.web.bind.annotation.RequestMapping; 6 | import org.springframework.web.bind.annotation.RequestParam; 7 | import org.springframework.web.bind.annotation.RestController; 8 | 9 | import java.io.File; 10 | import java.io.IOException; 11 | import java.util.List; 12 | 13 | @RestController 14 | public class CorpusController { 15 | /** 16 | * 预处理数据 17 | */ 18 | @RequestMapping("/loadCorpus") 19 | public String loadCorpus(@RequestParam(name = "path") String path) { 20 | List files = FileUtils.listFiles(path); 21 | files.forEach(file -> { 22 | try { 23 | NLPUtils.textPreprocessing(file, file.getParent() + "/dump/" + file.getName() + ".seg.txt"); 24 | } catch (IOException e) { 25 | e.printStackTrace(); 26 | } 27 | }); 28 | return "success"; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/controller/QAController.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.controller; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.hankcs.hanlp.HanLP; 5 | import com.hankcs.hanlp.seg.Segment; 6 | import com.hankcs.hanlp.seg.common.Term; 7 | import com.hankcs.hanlp.tokenizer.NotionalTokenizer; 8 | import com.hankcs.lucene.HanLPAnalyzer; 9 | import com.watt.configure.LuceneConfig; 10 | import com.watt.core.QuestionsIndex; 11 | import com.watt.core.dictionary.CoreAbbreviationDictionary; 12 | import com.watt.core.dictionary.CoreStopWordsDictionary; 13 | import com.watt.core.dictionary.MyCustomDictionary; 14 | import com.watt.core.nlp.cosinesimlarity.SimilarityAnalyze; 15 | import com.watt.core.nlp.cosinesimlarity.SimilarityAnalyzeUnfamiliarWords; 16 | import com.watt.mvc.beans.CheckResult; 17 | import com.watt.mvc.beans.PlatformResponse; 18 | import com.watt.mvc.beans.QAAnalyzeResult; 19 | import com.watt.mvc.service.QAService; 20 | import com.watt.util.CommonUtils; 21 | import org.apache.lucene.document.Document; 22 | import org.apache.lucene.queryparser.classic.ParseException; 23 | import org.apache.lucene.queryparser.classic.QueryParser; 24 | import org.apache.lucene.search.IndexSearcher; 25 | import org.apache.lucene.search.Query; 26 | import org.apache.lucene.search.ScoreDoc; 27 | import org.apache.lucene.search.TopDocs; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | import org.springframework.beans.factory.annotation.Autowired; 31 | import org.springframework.web.bind.annotation.RequestMapping; 32 | import org.springframework.web.bind.annotation.RestController; 33 | 34 | import javax.servlet.http.HttpServletRequest; 35 | import java.io.File; 36 | import java.io.FileInputStream; 37 | import java.io.IOException; 38 | import java.io.ObjectInputStream; 39 | import java.util.HashMap; 40 | import java.util.List; 41 | import java.util.Map; 42 | 43 | /** 44 | * 对话实现类 45 | */ 46 | @RestController 47 | public class QAController { 48 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 49 | private LuceneConfig luceneConfig; 50 | private QuestionsIndex questionsIndex; 51 | private Segment segment; 52 | private SimilarityAnalyze similarAnalyze = new SimilarityAnalyzeUnfamiliarWords(); 53 | private QAService qaService; 54 | private MyCustomDictionary myCustomDictionary; 55 | private Map tfidf = null; 56 | @Autowired 57 | public QAController(LuceneConfig luceneConfig, QuestionsIndex questionsIndex, QAService qaService, MyCustomDictionary myCustomDictionary) { 58 | 
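// Startup sequence: the constructor wires the Spring beans, then eagerly loads
// everything the answering path needs (custom hot words, stop words,
// abbreviations, the word-vector model, synonym-calibrated vectors, the shared
// HanLP segmenter, and the TF-IDF model), so the first request pays no warm-up cost.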
this.luceneConfig = luceneConfig; 59 | this.questionsIndex = questionsIndex; 60 | this.qaService = qaService; 61 | this.myCustomDictionary = myCustomDictionary; 62 | //加载分词热词 63 | myCustomDictionary.initDictionary(); 64 | //加载停用词词典 65 | myCustomDictionary.initStopWords(); 66 | //加载全、简称词典将简称字段加入到分词热词中(全称不加入) 67 | myCustomDictionary.initAbbreviation(); 68 | //将所有的向量加载 69 | initWordVectors(); 70 | //1.词向量矫正 2.检索所有维护的词林同义词词典 3.并将所有的税务同义词加入到热词中 71 | initCilin(); 72 | //初始化分词服务 73 | initSeg(); 74 | //初始化tfidf模型 75 | initTfidf(); 76 | } 77 | 78 | /** 79 | * 初始化分词服务 80 | */ 81 | private void initSeg() { 82 | segment = HanLP.newSegment(); 83 | NotionalTokenizer.SEGMENT = segment; 84 | } 85 | 86 | /** 87 | * 初始化加载词向量 88 | */ 89 | private void initWordVectors() { 90 | similarAnalyze.loadGoogleModel(luceneConfig.getVectorPath()); 91 | logger.info("词向量加载完成"); 92 | } 93 | 94 | private void initTfidf(){ 95 | try { 96 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File(luceneConfig.getTfidfPath()))); 97 | tfidf = (HashMap)in.readObject(); 98 | }catch (Exception e){ 99 | logger.error("tfidf模型没有加载成功"); 100 | } 101 | } 102 | 103 | /** 104 | * 加载词林词典将词林同义词赋值到向量中 105 | */ 106 | private void initCilin() { 107 | //查询type类型为同义词的所有词汇,去掉like的词语 108 | //通过同义词、和专业词词典来重新校准向量值 109 | List> lines = qaService.querySynonymsAll("="); 110 | lines.forEach(line -> { 111 | String[] synonyms = line.get("synonym").trim().split("\\s+"); 112 | for (String synonym : synonyms) { 113 | float[] vector = similarAnalyze.getVec().getWordVector(synonym); 114 | if (vector != null) { 115 | for (String synonym1 : synonyms) { 116 | similarAnalyze.getVec().setWordVector(synonym1, vector); 117 | //如果存在自定义的同义词需要在此处加入到用户自定义分词词典中去 118 | } 119 | break; 120 | } 121 | } 122 | }); 123 | logger.info("同义词词林校准向量加载完成:" + lines.size()); 124 | } 125 | 126 | /** 127 | * 对话接口提供方法,分发多轮还是问答 128 | */ 129 | @RequestMapping("/getAnswer") 130 | public PlatformResponse query(HttpServletRequest request) throws IOException, ParseException { 131 | String question = request.getParameter("question").trim().replaceAll("\\s*", ""); 132 | JSONArray resultArray = this.searchAndCalculate(question); 133 | if (resultArray.isEmpty()) { 134 | return new PlatformResponse(); 135 | } 136 | //将分析的TOP最高的问题进行评估,返回一个评分最高的答案 137 | resultArray = CommonUtils.arrayCompare(resultArray); 138 | QAAnalyzeResult qaAnalyzeResult = resultArray.getObject(0, QAAnalyzeResult.class); 139 | Map answer = qaService.queryAnswer(qaAnalyzeResult.getKey()); 140 | String media_type = answer.get("MEDIA_TYPE"); 141 | String media_url = media_type.equals("IMG") || media_type.equals("GT") ? 
142 | qaService.queryMediaUrlByREF_ID(answer.get("REF_ID")) 143 | : null; 144 | //保存日志 145 | try { 146 | qaService.createLog(question, qaAnalyzeResult.getScore() + "", qaAnalyzeResult.getKey(), "manager", ""); 147 | } catch (Exception e) { 148 | e.printStackTrace(); 149 | } 150 | return new PlatformResponse(qaAnalyzeResult.getMatch(), 151 | question, qaAnalyzeResult.getScore(), resultArray, 152 | answer.get("TEXT_ANS"), qaAnalyzeResult.getKey(), answer.get("MEDIA_TYPE"), answer.get("REF_ID"), "1", "", media_url); 153 | } 154 | 155 | 156 | 157 | /** 158 | * getAnswer公共代码 159 | */ 160 | private JSONArray searchAndCalculate(String question) throws IOException, ParseException { 161 | //将目标问题进行分词,留着分析用 162 | List seg_question = CoreStopWordsDictionary.removeStopWords(CoreAbbreviationDictionary.convertAbbreviationToFull(segment.seg(question))); 163 | logger.info("全称转换后:" + seg_question); 164 | IndexSearcher searcher = luceneConfig.getIndexSearcher(); 165 | Query query = new QueryParser(luceneConfig.getIndexKey(), new HanLPAnalyzer()).parse(question); 166 | TopDocs result = searcher.search(query, 100); 167 | JSONArray resultArray = new JSONArray(); 168 | for (ScoreDoc doc : result.scoreDocs) { 169 | Document document = searcher.doc(doc.doc); 170 | String question2 = document.get("questions"); 171 | List seg_question2 = segment.seg(question2); 172 | double score = similarAnalyze.sentenceSimilarity(seg_question, seg_question2); 173 | resultArray.add(new QAAnalyzeResult(score, document.get("key"), question2)); 174 | } 175 | return resultArray; 176 | } 177 | 178 | /** 179 | * 初始化时创建索引 180 | */ 181 | @RequestMapping("/createIndex") 182 | public CheckResult createIndex() { 183 | questionsIndex.createIndex(); 184 | return new CheckResult("000", "success", ""); 185 | } 186 | 187 | /** 188 | * 重新加载全简称词典 189 | */ 190 | @RequestMapping("/reloadAbbreviation") 191 | public CheckResult reloadAbbreviation() { 192 | myCustomDictionary.initAbbreviation(); 193 | return new CheckResult("000", "success", ""); 194 | } 195 | 196 | /** 197 | * 相似性分析核心计算类 198 | */ 199 | public SimilarityAnalyze getSimilarAnalyze() { 200 | return similarAnalyze; 201 | } 202 | 203 | /** 204 | * 获取统一的分词对象 205 | */ 206 | public Segment getSegment() { 207 | return segment; 208 | } 209 | 210 | } 211 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/dao/QADao.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.dao; 2 | 3 | import com.watt.data.jdbc.MySqlSessionTemplate; 4 | import org.mybatis.spring.SqlSessionTemplate; 5 | import org.springframework.beans.factory.annotation.Autowired; 6 | import org.springframework.stereotype.Repository; 7 | 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | @Repository 12 | public class QADao { 13 | private SqlSessionTemplate sqlSessionTemplate; 14 | 15 | @Autowired 16 | public QADao(MySqlSessionTemplate sqlSessionTemplate) { 17 | this.sqlSessionTemplate = sqlSessionTemplate.getSqlSessionTemplate(); 18 | } 19 | 20 | /** 21 | * 查询所有的问题列表,根据start 和 end作为大数据下的分页 22 | * 23 | */ 24 | public List> queryQuestions(Map param) { 25 | return sqlSessionTemplate.selectList("QADao.queryQuestions", param); 26 | } 27 | 28 | /** 29 | * 查找答案 30 | * @param map kwid 31 | * @return 一个答案 32 | */ 33 | public List> queryAnswer(Map map) { 34 | return sqlSessionTemplate.selectList("QADao.queryAnswer", map); 35 | } 36 | /** 37 | * 查询所有的词典列表 38 | */ 39 | public List> queryDictionaryAll() { 40 | return 
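// Statement ids such as "QADao.queryDictionaryAll" are resolved against the
// mapped statements declared in QADao.xml through the shared SqlSessionTemplate.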
sqlSessionTemplate.selectList("QADao.queryDictionaryAll", null); 41 | } 42 | 43 | /** 44 | * 根据条件查询同义词词典 45 | */ 46 | public List> querySynonymsAll(Map param) { 47 | return sqlSessionTemplate.selectList("QADao.querySynonymsAll", param); 48 | } 49 | /** 50 | * 添加税务专用名词词典 51 | * 52 | * @param word 词条或语料 53 | * @return int 54 | */ 55 | public int addTaxDictionaryWord(String word) { 56 | return sqlSessionTemplate.insert("QADao.addTaxDictionaryWord", word); 57 | } 58 | 59 | /** 60 | * 向日志表中插入一条数据,写一条日志 61 | */ 62 | public int createLog(Map param) { 63 | return sqlSessionTemplate.insert("QADao.createLog", param); 64 | } 65 | /** 66 | * 查询同义词列表 67 | */ 68 | public List querySynonyms(){ 69 | return sqlSessionTemplate.selectList("QADao.querySynonyms"); 70 | } 71 | 72 | /** 73 | * 查询全、简称词典 74 | */ 75 | public List> queryAbbreviation(Map param) { 76 | return sqlSessionTemplate.selectList("QADao.queryAbbreviation", param); 77 | } 78 | 79 | /** 80 | * 查询所有停用词 81 | */ 82 | public List queryStopWordsAll() { 83 | return sqlSessionTemplate.selectList("QADao.queryStopWordsAll"); 84 | } 85 | 86 | public Map queryMediaUrlByREF_ID(Map param) { 87 | return sqlSessionTemplate.selectOne("QADao.queryMediaUrlByREF_ID",param); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/dao/QADao.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 15 | 22 | 26 | 27 | INSERT INTO knowlede_dictionary_custom (WORD) 28 | VALUES (#{word , jdbcType=VARCHAR}) 29 | 30 | 34 | 35 | 42 | 43 | INSERT INTO knowledge_dictionary_synonym (id, synonym) 44 | VALUES (#{id , jdbcType=VARCHAR}, #{synonym , jdbcType=VARCHAR}) 45 | 46 | 47 | 57 | 58 | INSERT INTO knowledge_qa_logs (id, question, score, question_id, channel_id) 59 | VALUES (#{id , jdbcType=VARCHAR}, #{question , jdbcType=VARCHAR}, #{score , jdbcType=VARCHAR}, 60 | #{kw_id , jdbcType=VARCHAR}, #{channel_id , jdbcType=VARCHAR}) 61 | 62 | 63 | 67 | 72 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/service/QAService.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.service; 2 | 3 | import com.watt.mvc.dao.QADao; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.stereotype.Service; 6 | 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | @Service 12 | public class QAService { 13 | private QADao qaDao; 14 | 15 | @Autowired 16 | public QAService(QADao qaDao) { 17 | this.qaDao = qaDao; 18 | } 19 | 20 | /** 21 | * 查询所有的问题列表,根据start 和 end 作为大数据下的分页 22 | */ 23 | public List> queryQuestions(int start, int end) { 24 | Map param = new HashMap(); 25 | param.put("start", start); 26 | param.put("end", end); 27 | return qaDao.queryQuestions(param); 28 | } 29 | 30 | /** 31 | * 查询所有的词典列表 32 | */ 33 | public List> queryDictionaryAll() { 34 | return qaDao.queryDictionaryAll(); 35 | } 36 | 37 | /** 38 | * 添加税务专用名词词典 39 | * 40 | * @param word 词条或语料 41 | * @return int 42 | */ 43 | public int addTaxDictionaryWord(String word) { 44 | return qaDao.addTaxDictionaryWord(word); 45 | } 46 | 47 | /** 48 | * 查询同义词列表 49 | */ 50 | public List querySynonyms() { 51 | return qaDao.querySynonyms(); 52 | } 53 | 54 | /** 55 | * 根据条件查询同义词词典 56 | */ 57 | public List> querySynonymsAll(String type) { 58 | Map param = new HashMap<>(); 59 | param.put("type", type); 60 | return 
qaDao.querySynonymsAll(param); 61 | } 62 | 63 | /** 64 | * 根据问题的id查询答案 65 | */ 66 | public Map queryAnswer(String key) { 67 | Map map = new HashMap<>(); 68 | map.put("key", key); 69 | List> result = qaDao.queryAnswer(map); 70 | if (result == null || result.isEmpty()) { 71 | return null; 72 | } else { 73 | return result.get(0); 74 | } 75 | } 76 | 77 | /** 78 | * 查询全、简称词典 79 | */ 80 | public List> queryAbbreviation(Map param) { 81 | return qaDao.queryAbbreviation(param); 82 | } 83 | 84 | /** 85 | * 向日志表中插入一条数据,写一条日志 86 | */ 87 | public int createLog(String question, String score, String kw_id, String channel_id, String user_id) { 88 | Map param = new HashMap<>(); 89 | param.put("question", question); 90 | param.put("score", score); 91 | param.put("kw_id", kw_id); 92 | param.put("channel_id", channel_id); 93 | param.put("user_id", user_id); 94 | return qaDao.createLog(param); 95 | } 96 | 97 | public List queryStopWordsAll() { 98 | return qaDao.queryStopWordsAll(); 99 | } 100 | 101 | public String queryMediaUrlByREF_ID(String key) { 102 | Map param = new HashMap<>(); 103 | param.put("REF_ID", key); 104 | Map map = qaDao.queryMediaUrlByREF_ID(param); 105 | return map.get("MEDIA_URL").toString(); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/com/watt/util/CommonUtils.java: -------------------------------------------------------------------------------- 1 | package com.watt.util; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.alibaba.fastjson.JSONObject; 5 | 6 | public class CommonUtils { 7 | /** 8 | * 给target按照score的大小重新从大到小排序 9 | * @return 排序后的array 10 | */ 11 | public static JSONArray arrayCompare(JSONArray target) { 12 | JSONArray result = new JSONArray(); 13 | for (int i = 0; target.size() > 0; i++) { 14 | JSONObject iobj = findMaxScore(target); 15 | removeOne(target, iobj); 16 | result.add(iobj); 17 | } 18 | return result; 19 | } 20 | 21 | /** 22 | * 获得JSON列表的top N 23 | * 24 | * @param target 目表array 25 | * @param n 多少个 26 | */ 27 | public static JSONArray getTop(JSONArray target, int n) { 28 | JSONArray result = new JSONArray(); 29 | for (int i = 0; i < target.size() && i < n; i++) { 30 | result.add(target.get(i)); 31 | } 32 | return result; 33 | } 34 | 35 | /** 36 | * 遍历整个list找出一个最大的Object 37 | */ 38 | private static JSONObject findMaxScore(JSONArray target) { 39 | JSONObject firstObject = target.getJSONObject(0); 40 | for (int i = 1; i < target.size(); i++) { 41 | if (target.getJSONObject(i).getDouble("score") > firstObject.getDouble("score")) { 42 | firstObject = target.getJSONObject(i); 43 | } 44 | } 45 | return firstObject; 46 | } 47 | 48 | /** 49 | * 移除某个key为制定的object 50 | */ 51 | private static void removeOne(JSONArray target, JSONObject one) { 52 | for (int i = 0; i < target.size(); i++) { 53 | if (target.getJSONObject(i).get("questionID").equals(one.get("questionID"))) { 54 | target.remove(i); 55 | return; 56 | } 57 | } 58 | } 59 | 60 | // public static void main(String[] args) { 61 | // JSONArray array = new JSONArray(); 62 | // JSONObject object1 = new JSONObject(); 63 | // JSONObject object2 = new JSONObject(); 64 | // JSONObject object3 = new JSONObject(); 65 | // JSONObject object4 = new JSONObject(); 66 | // 67 | // object1.put("score",2.44); 68 | // object1.put("key",2.44); 69 | // object2.put("score",4.454); 70 | // object2.put("key",4.454); 71 | // object3.put("score",3.24); 72 | // object3.put("key",3.24); 73 | // object4.put("score",16.00); 74 | // object4.put("key",16.00); 75 | // 
array.add(object1); 76 | // array.add(object2); 77 | // array.add(object3); 78 | // array.add(object4); 79 | // System.out.println(arrayCompare(array).toJSONString()); 80 | // } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/watt/util/FileUtils.java: -------------------------------------------------------------------------------- 1 | package com.watt.util; 2 | 3 | import java.io.*; 4 | import java.nio.charset.Charset; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Objects; 8 | 9 | public class FileUtils { 10 | /** 11 | * 清除某个目录下所有的文件 12 | * 13 | * @param path 目标path 14 | */ 15 | public static void clearPath(String path) { 16 | File file = new File(path); 17 | if (file.isDirectory()) { 18 | File[] childFiles = file.listFiles(); 19 | if (childFiles == null || file.length() == 0) { 20 | return; 21 | } 22 | for (File childFile : childFiles) { 23 | childFile.delete(); 24 | } 25 | } else { 26 | file.delete(); 27 | } 28 | } 29 | 30 | /** 31 | * 将文件目录下所有文件全部罗列出来进行 32 | * 33 | * @param path 父文件路径 34 | * @return 所有的文件列表 35 | */ 36 | public static List listFiles(String path) { 37 | List result = new ArrayList<>(); 38 | File file = new File(path); 39 | for (File one : Objects.requireNonNull(file.listFiles())) { 40 | if (one.isDirectory()) { 41 | result.addAll(listFiles(one.getPath())); 42 | } else { 43 | result.add(one); 44 | } 45 | } 46 | return result; 47 | } 48 | 49 | /** 50 | * 给定文件返回读取方法 51 | * 52 | * @param file 目标文件 53 | * @return 大文件读取的文件流 54 | */ 55 | public static BufferedReader getFileReader(File file) throws FileNotFoundException { 56 | return new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)), Charset.forName("GBK"))); 57 | } 58 | 59 | /** 60 | * 给定文件返回读取方法 61 | * 62 | * @param file 目标文件 63 | * @return 大文件读取的文件流 64 | */ 65 | public static BufferedReader getFileReader(File file, Charset charset) throws FileNotFoundException { 66 | return new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)), charset)); 67 | } 68 | /** 69 | * 给定文件返回读取方法 70 | * 71 | * @param file 目标文件 72 | * @return 大文件读取的文件流 73 | */ 74 | public static BufferedReader getFileReader(String file) throws FileNotFoundException { 75 | return new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)))); 76 | } 77 | 78 | public static String readLine(BufferedReader reader) throws IOException { 79 | return reader.readLine(); 80 | } 81 | 82 | /** 83 | * @throws FileNotFoundException 文件找不到 84 | */ 85 | public static BufferedWriter getBufferedWriter(String fileName) throws IOException { 86 | File createFile = new File(fileName); 87 | if (!createFile.exists()) { 88 | createFile.createNewFile(); 89 | } 90 | return new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(createFile)))); 91 | } 92 | public static void main(String[] args){ 93 | List files = listFiles("/root/data/corpus/"); 94 | files.forEach(file -> { 95 | System.out.println(file.getParent()+","+file.getPath()); 96 | }); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/watt/util/NLPUtils.java: -------------------------------------------------------------------------------- 1 | package com.watt.util; 2 | 3 | import com.hankcs.hanlp.HanLP; 4 | import com.hankcs.hanlp.seg.common.Term; 5 | import com.hankcs.hanlp.tokenizer.NotionalTokenizer; 6 | 7 | import 
java.io.BufferedReader; 8 | import java.io.BufferedWriter; 9 | import java.io.File; 10 | import java.io.IOException; 11 | import java.util.List; 12 | 13 | public class NLPUtils { 14 | /** 15 | * 训练语料文件预处理 16 | * 1.分词 17 | * 2.繁转简 18 | * 3.去停用词 19 | * 20 | * @param file 目标文件 21 | * @param target 存储到的目标文件 22 | */ 23 | public static void textPreprocessing(File file, String target) throws IOException { 24 | BufferedReader reader = FileUtils.getFileReader(file); 25 | BufferedWriter writer = FileUtils.getBufferedWriter(target); 26 | String wordLine = null; 27 | while ((wordLine = reader.readLine()) != null) { 28 | wordLine = HanLP.tw2s(wordLine); 29 | List termList = NotionalTokenizer.segment(wordLine); 30 | String line = convertTermtoString(termList); 31 | writer.newLine(); 32 | writer.write(line); 33 | } 34 | writer.flush(); 35 | reader.close(); 36 | writer.close(); 37 | } 38 | 39 | /** 40 | * 将分词数据拼接成字符串 41 | */ 42 | public static String convertTermtoString(List termList, String segChar) { 43 | StringBuffer buffer = new StringBuffer(); 44 | termList.forEach(term -> { 45 | buffer.append(term.word).append(segChar); 46 | }); 47 | return buffer.toString().trim(); 48 | } 49 | 50 | /** 51 | * 将分词数据拼接成字符串 52 | */ 53 | public static String convertTermtoString(List termList) { 54 | return convertTermtoString(termList," "); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/resources/application.yml: -------------------------------------------------------------------------------- 1 | db: 2 | mysql: 3 | driverClass: com.mysql.cj.jdbc.Driver #这个是mysql驱动配置不需要改动 4 | jdbcUrl: jdbc:mysql://10.111.29.21:3306/tax_knowledge?useUnicode=true&characterEncoding=gb2312 #mysql地址端口号配置 5 | user: root #mysql用户名和密码配置 6 | password: Abcd1234! 
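# Note: the host, credentials and characterEncoding above are sample values; they must
# match your own MySQL instance and the gb2312/utf8 charsets used by the tables in init.sql.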
#mysql用户名和密码配置 7 | mybatis: 8 | mybatisXml: /mybatis.xml #mybatis文件配置路径 9 | lucene: 10 | root: /root/lucene/ #Lucene索引位置的根目录 11 | indexKey: questionWithSynonyms #这个是Lucene查询、建立索引的时候共享的一个key,这个key可以一直不改变 12 | vectorPath: /root/data/wiki_chinese_word2vec.bin #词向量物理路径 13 | tfidfPath: /root/data/tfidf #tfidf模型路径 -------------------------------------------------------------------------------- /src/main/resources/hanlp.properties: -------------------------------------------------------------------------------- 1 | #本配置文件中的路径的根目录,根目录+其他路径=完整路径(支持相对路径,请参考:https://github.com/hankcs/HanLP/pull/254) 2 | #Windows用户请注意,路径分隔符统一使用/ 3 | root=/root/ 4 | #核心词典路径 5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt 6 | #2元语法词典路径 7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt 8 | #停用词词典路径 9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt 10 | #同义词词典路径 11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt 12 | #人名词典路径 13 | PersonDictionaryPath=data/dictionary/person/nr.txt 14 | #人名词典转移矩阵路径 15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt 16 | #繁简词典根目录 17 | tcDictionaryRoot=data/dictionary/tc 18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。 19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除。所有词典统一使用UTF-8编码。 20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf; 21 | #CRF分词模型路径 22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt 23 | #HMM分词模型 24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin 25 | #分词结果是否展示词性 26 | ShowTermNature=true 27 | #IO适配器,实现com.hankcs.hanlp.corpus.io.IIOAdapter接口以在不同的平台(Hadoop、Redis等)上运行HanLP 28 | #默认的IO适配器如下,该适配器是基于普通文件系统的。 29 | #IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter -------------------------------------------------------------------------------- /src/main/resources/mybatis.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/lib/hanlp-1.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/watt1010/knowledge/9141c30f039bc924a0ef8c0e83e5233d1042ce04/src/main/webapp/WEB-INF/lib/hanlp-1.7.2.jar -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | Archetype Created Web Application 7 | 8 | -------------------------------------------------------------------------------- /src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World!

4 | 5 | 6 | 
-------------------------------------------------------------------------------- /src/test/java/MapCount.java: --------------------------------------------------------------------------------
1 | import java.io.Serializable;
2 | import java.util.HashMap;
3 | 
4 | /**
5 |  * Per-key counter triple: value[0] = total term frequency, value[1] = document
6 |  * frequency, value[2] = index of the document counted last (a scratch field
7 |  * used to avoid double-counting within one document).
8 |  */
9 | public class MapCount<T> implements Serializable {
10 | private HashMap<T, int[]> data;
11 | 
12 | MapCount() {
13 | this.data = new HashMap<>();
14 | }
15 | 
16 | public MapCount(int initialCapacity) {
17 | this.data = new HashMap<>(initialCapacity);
18 | }
19 | 
20 | private void add(T key, int index, int n) {
21 | int[] value;
22 | if ((value = this.data.get(key)) != null) {
23 | value[index] = value[index] + n;
24 | this.data.put(key, value);
25 | } else {
26 | value = new int[3];
27 | value[index] = value[index] + n;
28 | this.data.put(key, value);
29 | }
30 | }
31 | 
32 | void add(T key, int index) {
33 | this.add(key, index, 1);
34 | }
35 | 
36 | public int size() {
37 | return this.data.size();
38 | }
39 | 
40 | public void remove(T t) {
41 | this.data.remove(t);
42 | }
43 | 
44 | public HashMap<T, int[]> get() {
45 | return this.data;
46 | }
47 | }
-------------------------------------------------------------------------------- /src/test/java/wikiCorpus.java: --------------------------------------------------------------------------------
1 | import com.hankcs.hanlp.HanLP;
2 | import com.hankcs.hanlp.corpus.document.sentence.Sentence;
3 | import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
4 | import com.hankcs.hanlp.seg.common.Term;
5 | import com.hankcs.hanlp.tokenizer.NLPTokenizer;
6 | import com.watt.util.FileUtils;
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.junit.Test;
10 | 
11 | import java.io.*;
12 | import java.nio.charset.Charset;
13 | import java.util.*;
14 | 
15 | public class wikiCorpus {
16 | 
17 | /**
18 |  * Total number of documents in the corpus: 1020652
19 |  *
20 |  * @throws Exception error stop
21 |  */
22 | @Test
23 | public void generateKeySet() throws Exception {
24 | BufferedReader reader = FileUtils.getFileReader(new File("D:\\corpus\\data\\zh_wiki_00"), Charset.forName("UTF-8"));
25 | MapCount<String> mapCount = new MapCount<>();
26 | StringBuilder wordLine = new StringBuilder();
27 | String temp = null;
28 | int count = 1;
29 | while ((temp = reader.readLine()) != null) {
30 | wordLine.append(temp);
31 | // the "<doc ...>...</doc>" markers below were stripped from this dump and have been restored
32 | if (wordLine.indexOf("<doc") > -1 && wordLine.indexOf("</doc>") > -1) {
33 | int start = wordLine.indexOf("<doc");
34 | int end = wordLine.indexOf("</doc>") + 6;
35 | String s = wordLine.substring(start, end);
36 | wordLine.delete(start, end);
37 | 
38 | Document doc = Jsoup.parse(s);
39 | Sentence sentence = NLPTokenizer.ANALYZER.analyze(HanLP.tw2s(doc.select("doc").text()));
40 | for (IWord iWord : Objects.requireNonNull(sentence).wordList) {
41 | int[] value = Optional.ofNullable(mapCount.get().get(iWord.getValue())).orElse(new int[3]);
42 | if (value[2] != count) {
43 | value[2] = count; // remember the current document index
44 | value[1] = value[1] + 1; // document frequency +1
45 | value[0] = value[0] + 1; // total term frequency +1
46 | } else {
47 | value[0] = value[0] + 1;
48 | }
49 | mapCount.get().put(iWord.getValue(), value);
50 | }
51 | System.out.println("count:" + count++);
52 | // if (count % 500 == 0) {
53 | // ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keyset\\" + UUID.randomUUID().toString())));
54 | // out.writeObject(mapCount);
55 | // out.flush();
56 | // out.close();
57 | // }
58 | }
59 | }
60 | ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keySetNLP")));
61 | out.writeObject(mapCount);
62 | out.flush();
63 | out.close();
64 | System.out.println("count:" + count);
65 | reader.close();
66 | }
67 | 
68 | @Test
69 | public void generateKeySet2() throws Exception {
70 | BufferedReader reader = FileUtils.getFileReader(new File("D:\\corpus\\data\\zh_wiki_00"), Charset.forName("UTF-8"));
71 | MapCount<String> mapCount = new MapCount<>();
72 | StringBuilder wordLine = new StringBuilder();
73 | String temp = null;
74 | int count = 1;
75 | while ((temp = reader.readLine()) != null) {
76 | wordLine.append(temp);
77 | if (wordLine.indexOf("<doc") > -1 && wordLine.indexOf("</doc>") > -1) {
78 | int start = wordLine.indexOf("<doc");
79 | int end = wordLine.indexOf("</doc>") + 6;
80 | String s = wordLine.substring(start, end);
81 | wordLine.delete(start, end);
82 | 
83 | Document doc = Jsoup.parse(s);
84 | List<Term> sentence = HanLP.segment(HanLP.tw2s(doc.select("doc").text()));
85 | for (Term term : Objects.requireNonNull(sentence)) {
86 | int[] value = Optional.ofNullable(mapCount.get().get(term.word)).orElse(new int[3]);
87 | if (value[2] != count) {
88 | value[2] = count; // remember the current document index
89 | value[1] = value[1] + 1; // document frequency +1
90 | value[0] = value[0] + 1; // total term frequency +1
91 | } else {
92 | value[0] = value[0] + 1;
93 | }
94 | mapCount.get().put(term.word, value);
95 | }
96 | System.out.println("count:" + count++);
97 | // if (count % 500 == 0) {
98 | // ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keyset\\" + UUID.randomUUID().toString())));
99 | // out.writeObject(mapCount);
100 | // out.flush();
101 | // out.close();
102 | // }
103 | }
104 | }
105 | ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keySet1")));
106 | out.writeObject(mapCount);
107 | out.flush();
108 | out.close();
109 | System.out.println("count:" + count);
110 | reader.close();
111 | }
112 | 
113 | @Test
114 | public void caltfidf() throws Exception {
115 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File("D:\\corpus\\data\\keySet1")));
116 | MapCount<String> mapCount = (MapCount<String>) in.readObject();
117 | Map<String, Double> tfidf = new HashMap<>();
118 | for (Map.Entry<String, int[]> one : mapCount.get().entrySet()) {
119 | // value[1] is the document frequency; the dumped source divided by value[2],
120 | // the scratch document index, which would make the IDF term meaningless
121 | tfidf.put(one.getKey(), one.getValue()[0] * Math.log(1020652.0 / one.getValue()[1]));
122 | }
123 | ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\tfidf")));
124 | out.writeObject(tfidf);
125 | out.flush();
126 | out.close();
127 | }
128 | 
129 | @Test
130 | public void testcaltfidf() throws Exception {
131 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File("/home/watt/corpus/data/keySet1")));
132 | MapCount<String> mapCount = (MapCount<String>) in.readObject();
133 | Map<String, Double> tfidf = new HashMap<>();
134 | for (Map.Entry<String, int[]> one : mapCount.get().entrySet()) {
135 | tfidf.put(one.getKey(), one.getValue()[0] * Math.log(1020652.0 / one.getValue()[1]));
136 | }
137 | }
138 | 
139 | @Test
140 | public void readTfidf() throws Exception {
141 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File("/home/watt/corpus/data/tfidf")));
142 | Map<String, Double> tfidf = (HashMap<String, Double>) in.readObject();
143 | System.out.println(tfidf.get("的"));
144 | System.out.println(tfidf.get("是"));
145 | System.out.println(tfidf.get("为什么"));
146 | }
147 | 
148 | }
--------------------------------------------------------------------------------