├── .gitignore
├── init.sql
├── pom.xml
├── readme.md
└── src
    ├── main
    │   ├── java
    │   │   └── com
    │   │       ├── hankcs
    │   │       │   └── lucene
    │   │       │       ├── HanLPAnalyzer.java
    │   │       │       ├── HanLPIndexAnalyzer.java
    │   │       │       ├── HanLPTokenizer.java
    │   │       │       ├── HanLPTokenizerFactory.java
    │   │       │       ├── PorterStemmer.java
    │   │       │       └── SegmentWrapper.java
    │   │       └── watt
    │   │           ├── CloudApplication.java
    │   │           ├── configure
    │   │           │   ├── DbConfig.java
    │   │           │   ├── LuceneConfig.java
    │   │           │   └── MybatisConfig.java
    │   │           ├── core
    │   │           │   ├── QuestionsIndex.java
    │   │           │   ├── dictionary
    │   │           │   │   ├── CoreAbbreviationDictionary.java
    │   │           │   │   ├── CoreStopWordsDictionary.java
    │   │           │   │   └── MyCustomDictionary.java
    │   │           │   └── nlp
    │   │           │       └── cosinesimlarity
    │   │           │           ├── AtomSegment.java
    │   │           │           ├── IDExtract.java
    │   │           │           ├── SimilarityAnalyze.java
    │   │           │           ├── SimilarityAnalyzeUnfamiliarWords.java
    │   │           │           └── Word2Vec.java
    │   │           ├── data
    │   │           │   └── jdbc
    │   │           │       ├── MySqlDataSource.java
    │   │           │       ├── MySqlSessionFactoryBean.java
    │   │           │       └── MySqlSessionTemplate.java
    │   │           ├── mvc
    │   │           │   ├── beans
    │   │           │   │   ├── CheckResult.java
    │   │           │   │   ├── PlatformResponse.java
    │   │           │   │   └── QAAnalyzeResult.java
    │   │           │   ├── controller
    │   │           │   │   ├── CorpusController.java
    │   │           │   │   └── QAController.java
    │   │           │   ├── dao
    │   │           │   │   ├── QADao.java
    │   │           │   │   └── QADao.xml
    │   │           │   └── service
    │   │           │       └── QAService.java
    │   │           └── util
    │   │               ├── CommonUtils.java
    │   │               ├── FileUtils.java
    │   │               └── NLPUtils.java
    │   ├── resources
    │   │   ├── application.yml
    │   │   ├── hanlp.properties
    │   │   └── mybatis.xml
    │   └── webapp
    │       ├── WEB-INF
    │       │   ├── lib
    │       │   │   └── hanlp-1.7.2.jar
    │       │   └── web.xml
    │       └── index.jsp
    └── test
        └── java
            ├── MapCount.java
            └── wikiCorpus.java

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
### Maven Auto Generate ###
target/
!.mvn/wrapper/maven-wrapper.jar

### STS ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans

### IntelliJ IDEA ###
*.idea
*.iws
*.iml
*.ipr

### NetBeans ###
nbproject/private/
build/
nbbuild/
dist/
nbdist/
.nb-gradle/

### Windows ###
/C:/

--------------------------------------------------------------------------------
/init.sql:
--------------------------------------------------------------------------------
create table knowlede_dictionary_custom
(
    uuid int auto_increment
        primary key,
    word varchar(40) null
)
    charset = gb2312;

create table knowledge_category
(
    id            varchar(64)  not null comment 'primary key'
        primary key,
    create_by     varchar(64)  null comment 'creator',
    create_date   datetime     null comment 'creation time',
    update_by     varchar(64)  null comment 'last modifier',
    update_date   datetime     null comment 'last modification time',
    remarks       varchar(255) null comment 'remarks',
    del_flag      varchar(64)  null comment 'logical delete flag (0: visible; 1: hidden)',
    category_name varchar(64)  null comment 'category name'
)
    comment 'knowledge base categories';

create table knowledge_dictionary_abbreviation
(
    id          varchar(64)  not null comment 'primary key'
        primary key,
    create_by   varchar(64)  null comment 'creator',
    create_date datetime     null comment 'creation time',
    update_by   varchar(64)  null comment 'last modifier',
    update_date datetime     null comment 'last modification time',
    remarks     varchar(255) null comment 'remarks',
    del_flag    varchar(64)  null comment 'logical delete flag (0: visible; 1: hidden)',
    abbr_name   varchar(64)  null comment 'abbreviation',
    full_name   varchar(64)  null comment 'full name'
)
    comment 'abbreviation management';
create table knowledge_dictionary_industry
(
    id            varchar(32)                        not null comment 'primary key'
        primary key,
    industry_name varchar(50)                        not null comment 'industry name',
    create_time   datetime default CURRENT_TIMESTAMP not null comment 'creation time',
    is_valid      char     default 'Y'               not null comment 'valid flag: Y = valid, N = invalid'
)
    comment 'industry dictionary' charset = gb2312;

create table knowledge_dictionary_stopwords
(
    id   int(10) auto_increment
        primary key,
    word varchar(255) null
)
    collate = utf8_bin;

create table knowledge_dictionary_synonym
(
    id          varchar(64) charset gbk not null comment 'primary key'
        primary key,
    create_by   varchar(64)             null comment 'creator',
    create_date datetime                null comment 'creation time',
    update_by   varchar(64)             null comment 'last modifier',
    update_date datetime                null comment 'last modification time',
    remarks     varchar(255)            null comment 'remarks',
    del_flag    varchar(64)             not null comment 'logical delete flag (0: visible; 1: hidden)',
    synonym     varchar(1000)           not null comment 'synonyms',
    type        varchar(1)              not null comment 'type: similar or equal'
)
    comment 'synonym dictionary';

create table knowledge_qa_answer
(
    id           varchar(64)  not null comment 'primary key'
        primary key,
    create_by    varchar(64)  null comment 'creator',
    create_date  datetime     null comment 'creation time',
    update_by    varchar(64)  null comment 'last modifier',
    update_date  datetime     null comment 'last modification time',
    remarks      varchar(255) null comment 'remarks',
    del_flag     varchar(64)  null comment 'logical delete flag (0: visible; 1: hidden)',
    answer       longtext     null comment 'answer',
    reference_id varchar(64)  null comment 'referenced media id',
    media_type   varchar(4)   null comment 'media type',
    category_id  varchar(64)  null comment 'category id'
)
    comment 'Q&A answers';

create table knowledge_qa_logs
(
    id          varchar(64)                        not null comment 'primary key'
        primary key,
    create_by   varchar(64)                        null comment 'creator',
    create_date datetime default CURRENT_TIMESTAMP null comment 'creation time',
    update_by   varchar(64)                        null comment 'last modifier',
    update_date datetime                           null comment 'last modification time',
    remarks     varchar(255)                       null comment 'remarks',
    del_flag    varchar(64) default '0'            null comment 'logical delete flag (0: visible; 1: hidden)',
    question    varchar(128)                       null comment 'question',
    score       varchar(64)                        null comment 'score',
    channel_id  varchar(64)                        null comment 'authorization id / access channel (WeChat, robot, etc.)',
    question_id varchar(64)                        null comment 'question id'
)
    comment 'Q&A logs';

create table knowledge_qa_media
(
    MEDIA_ID      int auto_increment comment 'media id'
        primary key,
    MEDIA_NAME    varchar(100)  not null comment 'media name',
    MEDIA_SUMMARY varchar(2048) null comment 'media summary',
    MEDIA_TYPE    varchar(3)    null comment 'media type (GT: rich text, IMG: image, AU: audio, VI: video)',
    MEDIA_URL     varchar(512)  null comment 'media link (when set, the media file stores the cover image)',
    CREATE_TIME   datetime      not null comment 'creation time',
    UPDATE_TIME   datetime      null comment 'last modification time'
)
    comment 'media';

create table knowledge_qa_question
(
    id          varchar(64)             not null comment 'primary key'
        primary key,
    create_by   varchar(64)             null comment 'creator',
    create_date datetime                null comment 'creation time',
    update_by   varchar(64)             null comment 'last modifier',
    update_date datetime                null comment 'last modification time',
    remarks     varchar(255)            null comment 'remarks',
    del_flag    varchar(64) default '0' null comment 'logical delete flag (0: visible; 1: hidden)',
    question    varchar(64)             null comment 'question',
    answer_id   varchar(64)             null comment 'answer id'
)
    comment 'knowledge base questions';

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.inspur.tax</groupId>
    <artifactId>knowledge-core</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>war</packaging>

    <name>knowledge-core Maven Webapp</name>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <lucene.version>7.4.0</lucene.version>
        <fastjson.version>1.2.35</fastjson.version>
        <mysql.connector>8.0.11</mysql.connector>
        <spring.version>5.0.8.RELEASE</spring.version>
        <c3p0.version>0.9.5.3</c3p0.version>
        <mybatis-spring.version>1.3.2</mybatis-spring.version>
        <mybatis.version>3.4.4</mybatis.version>
        <hanlp.version>1.7.2</hanlp.version>
    </properties>

    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.0.4.RELEASE</version>
    </parent>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>${lucene.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>

        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis</artifactId>
            <version>${mybatis.version}</version>
        </dependency>
        <dependency>
            <groupId>org.mybatis</groupId>
            <artifactId>mybatis-spring</artifactId>
            <version>${mybatis-spring.version}</version>
        </dependency>
        <dependency>
            <groupId>com.mchange</groupId>
            <artifactId>c3p0</artifactId>
            <version>${c3p0.version}</version>
        </dependency>

        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-jdbc</artifactId>
            <version>${spring.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.connector}</version>
        </dependency>

        <dependency>
            <groupId>com.hankcs</groupId>
            <artifactId>hanlp</artifactId>
            <version>${hanlp.version}</version>
            <scope>system</scope>
            <systemPath>${project.basedir}/src/main/webapp/WEB-INF/lib/hanlp-${hanlp.version}.jar</systemPath>
        </dependency>
    </dependencies>

    <build>
        <finalName>knowledge</finalName>
        <resources>
            <resource>
                <directory>src/main/java</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/resources</directory>
                <includes>
                    <include>**/</include>
                </includes>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/webapp</directory>
                <includes>
                    <include>**/*.properties</include>
                    <include>**/*.xml</include>
                </includes>
                <filtering>false</filtering>
            </resource>
            <resource>
                <directory>src/main/webapp/WEB-INF</directory>
                <targetPath>BOOT-INF/lib/</targetPath>
                <includes>
                    <include>**/*.jar</include>
                </includes>
            </resource>
        </resources>

        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
# Consulting Knowledge Base System Overview
Thanks to HanLP, Lucene and the other open-source projects that make building a system like this so much easier. Everyone is sincerely invited to help improve this project so that more people can learn from and use it.
## Installation and Deployment
### Preparing the corpus data
Baidu Netdisk download: https://pan.baidu.com/s/1Syhk2Ehv_5Of19bHlFNSig
Extraction code: 04ay
### Initializing the database
MySQL is used for static data storage; please download the script from the same netdisk share:
```
init.sql
```
### Build tooling
The project is managed with Maven and built on the Spring Boot microservice framework. IDEA, Eclipse or a similar IDE is recommended for development.
### Configuration
#### HanLP configuration (hanlp.properties):
Only the root property needs to be set; it is the filesystem path of the HanLP segmentation data package.
``` properties
root=/root/
```
#### Lucene and word-vector configuration
application.yml
``` yaml
lucene:
  root: /root/lucene/ # root directory of the Lucene index
  indexKey: questionWithSynonyms # field key shared between indexing and querying; it never needs to change
  vectorPath: /root/data/wiki_chinese_word2vec.bin # path of the word-vector file
```
#### MySQL configuration
```yaml
db:
  mysql:
    driverClass: com.mysql.cj.jdbc.Driver # the MySQL driver class; no need to change it
    jdbcUrl: jdbc:mysql://IP:3306/tax_knowledge?useUnicode=true&characterEncoding=gb2312 # MySQL host and port
    user: username # MySQL user name
    password: password # MySQL password
```
### Building the project
Run the Maven build (if you are new to Maven, brush up on the basics first):
```
mvn clean install
```
### Starting the application
Since this is a Spring Boot project, starting it only takes running the packaged build:
```
java -jar knowledge.war
```
### Creating the index
Call the index-creation endpoint from a browser:
```
http://ip:8080/createIndex
```
### Testing
Open in a browser:
```
http://ip:8080/getAnswer?question=收不到验证码
```
## Semantic Similarity (cosine similarity, CiLin thesaurus)
Semantic similarity is computed with cosine similarity, which testing showed to be the best fit for a specialized knowledge base. It is invoked through SimilarityAnalyzeUnfamiliarWords:
``` java
double score = similarAnalyze.sentenceSimilarity(seg_question, seg_question2);
// for every word of the first sentence, accumulate its similarity to the most similar word of the second sentence
for (Term sentence1Word : sentence1Words) {
    ++count1;
    sum1 += this.calMaxSimilarity(sentence1Word.word, sentence2Words);
}
// for every word of the second sentence, accumulate its similarity to the most similar word of the first sentence
for (Term sentence2Word : sentence2Words) {
    ++count2;
    sum2 += this.calMaxSimilarity(sentence2Word.word, sentence1Words);
}
// guard against zero counts so the divisions below cannot produce NaN
if (count1 == 0) {
    if (count2 == 0) {
        return 0F;
    } else {
        return sum2 / count2;
    }
} else if (count2 == 0) {
    return sum1 / count1;
}
// take the smaller of the two averages; this avoids inflated scores when a short text is contained in a longer one
return Math.min(sum1 / count1, sum2 / count2);
```
How the similarity of two word vectors is computed:
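The score is the cosine of the angle between the two word vectors, which is exactly what the loop below computes:
```
cos(v1, v2) = Σ v1[i]·v2[i] / ( √(Σ v1[i]²) · √(Σ v2[i]²) )
```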
The implementation of the cosine computation:
```
for (int i = 0; i < vec1.length; ++i) {
    dist += vec1[i] * vec2[i];
    sum1 += Math.pow(vec1[i], 2);
    sum2 += Math.pow(vec2[i], 2);
}
double result = dist / Math.sqrt(sum1 * sum2);
// floating-point rounding can push the result slightly above 1;
// cap it at 1.0 so later calculations never see a similarity above 100%
return result > 1.0 ? 1.0D : result;
```
## Code Entry Points
All of the application's functional entry points are in the QAController class.
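## Analyzer Usage Example
The Lucene analyzers under `com.hankcs.lucene` can also be used on their own. The sketch below is illustrative only: the in-memory `RAMDirectory`, the field name `q` and the sample text are invented for this example, and HanLP still needs its data path configured in `hanlp.properties`.
``` java
import com.hankcs.lucene.HanLPIndexAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.RAMDirectory;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        HanLPIndexAnalyzer analyzer = new HanLPIndexAnalyzer();
        // index a single question with the HanLP analyzer
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
            Document doc = new Document();
            doc.add(new Field("q", "收不到验证码怎么办", TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
        // query it back; the analyzer segments the query text the same way
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new QueryParser("q", analyzer).parse("验证码");
            TopDocs hits = searcher.search(query, 10);
            for (ScoreDoc sd : hits.scoreDocs) {
                System.out.println(searcher.doc(sd.doc).get("q") + " score=" + sd.score);
            }
        }
    }
}
```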
--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPAnalyzer.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

import java.util.Set;

public class HanLPAnalyzer extends Analyzer {
    private boolean enablePorterStemming;
    private Set<String> filter;

    /**
     * @param filter               stop words
     * @param enablePorterStemming whether to stem tokens (English only)
     */
    public HanLPAnalyzer(Set<String> filter, boolean enablePorterStemming) {
        this.filter = filter;
        this.enablePorterStemming = enablePorterStemming;
    }

    /**
     * @param enablePorterStemming whether to stem tokens, normalizing plurals and tense
     */
    public HanLPAnalyzer(boolean enablePorterStemming) {
        this.enablePorterStemming = enablePorterStemming;
    }

    public HanLPAnalyzer() {
        super();
    }

    /**
     * Overrides Analyzer to build the tokenization components.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment(), filter, enablePorterStemming);
        return new TokenStreamComponents(tokenizer);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPIndexAnalyzer.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

import java.util.Set;

public class HanLPIndexAnalyzer extends Analyzer {

    private boolean pstemming;
    private Set<String> filter;

    /**
     * @param filter    stop words
     * @param pstemming whether to stem tokens (English only)
     */
    public HanLPIndexAnalyzer(Set<String> filter, boolean pstemming) {
        this.filter = filter;
        this.pstemming = pstemming;
    }

    /**
     * @param pstemming whether to stem tokens, normalizing plurals and tense
     */
    public HanLPIndexAnalyzer(boolean pstemming) {
        this.pstemming = pstemming;
    }

    public HanLPIndexAnalyzer() {
        super();
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new HanLPTokenizer(HanLP.newSegment().enableIndexMode(true), filter, pstemming);
        return new TokenStreamComponents(tokenizer);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPTokenizer.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;


import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.utility.TextUtility;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.Set;

/**
 * Tokenizer, adapted from ansj
 */
public class HanLPTokenizer extends Tokenizer {
    // current term
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // offset
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // position increment
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
    private final PorterStemmer stemmer = new PorterStemmer();
    // part of speech
    private TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
    private SegmentWrapper segment;
    private BinTrie<String> filter;
    private boolean enablePorterStemming;
    /**
     * Running total offset within the current document. It is not cleared in reset()
     * (switching between the values of a multi-valued field), but is cleared in end()
     * (switching fields).
     */
    private int totalOffset = 0;

    /**
     * @param segment              a HanLP segmenter
     * @param filter               stop words
     * @param enablePorterStemming reduce English tokens to their stems
     */
    public HanLPTokenizer(Segment segment, Set<String> filter, boolean enablePorterStemming) {
        super();
        this.segment = new SegmentWrapper(input, segment);
        if (filter != null && filter.size() > 0) {
            this.filter = new BinTrie<String>();
            for (String stopWord : filter) {
                this.filter.put(stopWord, null);
            }
        }
        this.enablePorterStemming = enablePorterStemming;
    }

    @Override
    final public boolean incrementToken() throws IOException {
        clearAttributes();
        int position = 0;
        Term term;
        boolean un_increased = true;
        do { // loop until a usable term is found, skipping noise such as punctuation and whitespace
            term = segment.next();
            if (term == null) {
                break;
            }
            if (TextUtility.isBlank(term.word)) { // skip blanks to keep the index lean
                continue;
            }
            if (enablePorterStemming && term.nature == Nature.nx) {
                term.word = stemmer.stem(term.word);
            }

            if (filter != null && filter.containsKey(term.word)) {
                continue;
            } else {
                ++position;
                un_increased = false;
            }
        }
        while (un_increased);

        if (term != null) {
            positionAttr.setPositionIncrement(position);
            termAtt.setEmpty().append(term.word);
            offsetAtt.setOffset(correctOffset(totalOffset + term.offset),
                    correctOffset(totalOffset + term.offset + term.word.length()));
            typeAtt.setType(term.nature == null ? "null" : term.nature.toString());
            return true;
        } else {
            totalOffset += segment.offset;
            return false;
        }
    }

    @Override
    public void end() throws IOException {
        super.end();
        offsetAtt.setOffset(totalOffset, totalOffset);
        totalOffset = 0;
    }

    /**
     * Must be overridden; otherwise batch indexing of files fails.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        segment.reset(new BufferedReader(this.input));
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/hankcs/lucene/HanLPTokenizerFactory.java:
--------------------------------------------------------------------------------
package com.hankcs.lucene;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.TraditionalChineseTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeFactory;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

public class HanLPTokenizerFactory extends TokenizerFactory {
    private boolean enableIndexMode;
    private boolean enablePorterStemming;
    private boolean enableNumberQuantifierRecognize;
    private boolean enableCustomDictionary;
    private boolean enableCustomDictionaryForcing;
    private boolean enableTranslatedNameRecognize;
    private boolean enableJapaneseNameRecognize;
    private boolean enableOrganizationRecognize;
    private boolean enablePlaceRecognize;
    private boolean enableNameRecognize;
    private boolean enableTraditionalChineseMode;
    private String algorithm;
    private Set<String> stopWordDictionary;

    /**
     * Initializes the factory.
     *
     * @param args the configuration options from the XML, passed in as a Map
     */
    public HanLPTokenizerFactory(Map<String, String> args) {
        super(args);
        enableIndexMode = getBoolean(args, "enableIndexMode", true);
        enablePorterStemming = getBoolean(args, "enablePorterStemming", false);
        enableNumberQuantifierRecognize = getBoolean(args, "enableNumberQuantifierRecognize", false);
        enableCustomDictionary = getBoolean(args, "enableCustomDictionary", true);
        enableCustomDictionaryForcing = getBoolean(args, "enableCustomDictionaryForcing", true);
        enableTranslatedNameRecognize = getBoolean(args, "enableTranslatedNameRecognize", false);
        enableJapaneseNameRecognize = getBoolean(args, "enableJapaneseNameRecognize", false);
        enableOrganizationRecognize = getBoolean(args, "enableOrganizationRecognize", false);
        enableNameRecognize = getBoolean(args, "enableNameRecognize", false);
        enablePlaceRecognize = getBoolean(args, "enablePlaceRecognize", false);
        enableTraditionalChineseMode = getBoolean(args, "enableTraditionalChineseMode", false);
        HanLP.Config.Normalization = getBoolean(args, "enableNormalization", HanLP.Config.Normalization);
        algorithm = getString(args, "algorithm", "viterbi");
        Set<String> customDictionaryPathSet = getSet(args, "customDictionaryPath");
"customDictionaryPath"); 53 | if (customDictionaryPathSet != null) { 54 | HanLP.Config.CustomDictionaryPath = customDictionaryPathSet.toArray(new String[0]); 55 | } 56 | String stopWordDictionaryPath = get(args, "stopWordDictionaryPath"); 57 | if (stopWordDictionaryPath != null) { 58 | stopWordDictionary = new TreeSet<>(); 59 | stopWordDictionary.addAll(IOUtil.readLineListWithLessMemory(stopWordDictionaryPath)); 60 | } 61 | if (getBoolean(args, "enableDebug", false)) { 62 | HanLP.Config.enableDebug(); 63 | } 64 | } 65 | 66 | protected final String getString(Map args, String name, String defaultVal) { 67 | String s = args.remove(name); 68 | return s == null ? defaultVal : s; 69 | } 70 | 71 | @Override 72 | public Tokenizer create(AttributeFactory factory) { 73 | Segment segment = HanLP.newSegment(algorithm).enableOffset(true).enableIndexMode(enableIndexMode) 74 | .enableNameRecognize(enableNameRecognize) 75 | .enableNumberQuantifierRecognize(enableNumberQuantifierRecognize) 76 | .enableCustomDictionary(enableCustomDictionary) 77 | .enableCustomDictionaryForcing(enableCustomDictionaryForcing) 78 | .enableTranslatedNameRecognize(enableTranslatedNameRecognize) 79 | .enableJapaneseNameRecognize(enableJapaneseNameRecognize) 80 | .enableOrganizationRecognize(enableOrganizationRecognize) 81 | .enablePlaceRecognize(enablePlaceRecognize); 82 | if (enableTraditionalChineseMode) { 83 | segment.enableIndexMode(false); 84 | Segment inner = segment; 85 | TraditionalChineseTokenizer.SEGMENT = inner; 86 | segment = new Segment() { 87 | @Override 88 | protected List segSentence(char[] sentence) { 89 | List termList = TraditionalChineseTokenizer.segment(new String(sentence)); 90 | return termList; 91 | } 92 | }; 93 | } 94 | 95 | return new HanLPTokenizer(segment 96 | , stopWordDictionary, enablePorterStemming); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene/PorterStemmer.java: -------------------------------------------------------------------------------- 1 | package com.hankcs.lucene; 2 | 3 | import org.apache.lucene.util.ArrayUtil; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | 9 | import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; 10 | 11 | /** 12 | * 抄袭lucene的英文处理 13 | * Stemmer, implementing the Porter Stemming Algorithm 14 | *

15 | * The Stemmer class transforms a word into its root form. The input word can be 16 | * provided a character at time (by calling add()), or at once by calling one of 17 | * the various stem(something) methods. 18 | */ 19 | 20 | public class PorterStemmer { 21 | private static final int INITIAL_SIZE = 50; 22 | private char[] b; 23 | private int i, /* offset into b */ 24 | j, k, k0; 25 | private boolean dirty = false; 26 | 27 | public PorterStemmer() { 28 | b = new char[INITIAL_SIZE]; 29 | i = 0; 30 | } 31 | 32 | /** 33 | * Test program for demonstrating the Stemmer. It reads a file and stems 34 | * each word, writing the result to standard out. Usage: Stemmer file-name 35 | */ 36 | public static void main(String[] args) { 37 | PorterStemmer s = new PorterStemmer(); 38 | 39 | for (String arg : args) { 40 | try (InputStream in = new FileInputStream(arg)) { 41 | byte[] buffer = new byte[1024]; 42 | int bufferLen, offset, ch; 43 | 44 | bufferLen = in.read(buffer); 45 | offset = 0; 46 | s.reset(); 47 | 48 | while (true) { 49 | if (offset < bufferLen) ch = buffer[offset++]; 50 | else { 51 | bufferLen = in.read(buffer); 52 | offset = 0; 53 | if (bufferLen < 0) ch = -1; 54 | else ch = buffer[offset++]; 55 | } 56 | 57 | if (Character.isLetter((char) ch)) { 58 | s.add(Character.toLowerCase((char) ch)); 59 | } else { 60 | s.stem(); 61 | System.out.print(s.toString()); 62 | s.reset(); 63 | if (ch < 0) break; 64 | else { 65 | System.out.print((char) ch); 66 | } 67 | } 68 | } 69 | } catch (IOException e) { 70 | System.out.println("error reading " + arg); 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * reset() resets the stemmer so it can stem another word. If you invoke the 77 | * stemmer by calling add(char) and then stem(), you must call reset() 78 | * before starting another word. 79 | */ 80 | public void reset() { 81 | i = 0; 82 | dirty = false; 83 | } 84 | 85 | /** 86 | * Add a character to the word being stemmed. When you are finished adding 87 | * characters, you can call stem(void) to process the word. 88 | */ 89 | public void add(char ch) { 90 | if (b.length <= i) { 91 | b = ArrayUtil.grow(b, i + 1); 92 | } 93 | b[i++] = ch; 94 | } 95 | 96 | /** 97 | * After a word has been stemmed, it can be retrieved by toString(), or a 98 | * reference to the internal buffer can be retrieved by getResultBuffer and 99 | * getResultLength (which is generally more efficient.) 100 | */ 101 | @Override 102 | public String toString() { 103 | return new String(b, 0, i); 104 | } 105 | 106 | /** 107 | * Returns the length of the word resulting from the stemming process. 108 | */ 109 | public int getResultLength() { 110 | return i; 111 | } 112 | 113 | /* cons(i) is true <=> b[i] is a consonant. */ 114 | 115 | /** 116 | * Returns a reference to a character buffer containing the results of the 117 | * stemming process. You also need to consult getResultLength() to determine 118 | * the length of the result. 119 | */ 120 | public char[] getResultBuffer() { 121 | return b; 122 | } 123 | 124 | /* 125 | * m() measures the number of consonant sequences between k0 and j. if c is 126 | * a consonant sequence and v a vowel sequence, and <..> indicates arbitrary 127 | * presence, 128 | * 129 | * gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 130 | * .... 
131 | */ 132 | 133 | private boolean cons(int i) { 134 | switch (b[i]) { 135 | case 'a': 136 | case 'e': 137 | case 'i': 138 | case 'o': 139 | case 'u': 140 | return false; 141 | case 'y': 142 | return (i == k0) || !cons(i - 1); 143 | default: 144 | return true; 145 | } 146 | } 147 | 148 | /* vowelinstem() is true <=> k0,...j contains a vowel */ 149 | 150 | private int m() { 151 | int n = 0; 152 | int i = k0; 153 | while (true) { 154 | if (i > j) return n; 155 | if (!cons(i)) break; 156 | i++; 157 | } 158 | i++; 159 | while (true) { 160 | while (true) { 161 | if (i > j) return n; 162 | if (cons(i)) break; 163 | i++; 164 | } 165 | i++; 166 | n++; 167 | while (true) { 168 | if (i > j) return n; 169 | if (!cons(i)) break; 170 | i++; 171 | } 172 | i++; 173 | } 174 | } 175 | 176 | /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ 177 | 178 | private boolean vowelinstem() { 179 | int i; 180 | for (i = k0; i <= j; i++) 181 | if (!cons(i)) return true; 182 | return false; 183 | } 184 | 185 | /* 186 | * cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant 187 | * and also if the second c is not w,x or y. this is used when trying to 188 | * restore an e at the end of a short word. e.g. 189 | * 190 | * cav(e), lov(e), hop(e), crim(e), but snow, box, tray. 191 | */ 192 | 193 | private boolean doublec(int j) { 194 | if (j < k0 + 1) return false; 195 | if (b[j] != b[j - 1]) return false; 196 | return cons(j); 197 | } 198 | 199 | private boolean cvc(int i) { 200 | if (i < k0 + 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) return false; 201 | else { 202 | int ch = b[i]; 203 | if (ch == 'w' || ch == 'x' || ch == 'y') return false; 204 | } 205 | return true; 206 | } 207 | 208 | /* 209 | * setto(s) sets (j+1),...k to the characters in the string s, readjusting 210 | * k. 211 | */ 212 | 213 | private boolean ends(String s) { 214 | int l = s.length(); 215 | int o = k - l + 1; 216 | if (o < k0) return false; 217 | for (int i = 0; i < l; i++) 218 | if (b[o + i] != s.charAt(i)) return false; 219 | j = k - l; 220 | return true; 221 | } 222 | 223 | /* r(s) is used further down. */ 224 | 225 | void setto(String s) { 226 | int l = s.length(); 227 | int o = j + 1; 228 | for (int i = 0; i < l; i++) 229 | b[o + i] = s.charAt(i); 230 | k = j + l; 231 | dirty = true; 232 | } 233 | 234 | /* 235 | * step1() gets rid of plurals and -ed or -ing. e.g. 236 | * 237 | * caresses -> caress ponies -> poni ties -> ti caress -> caress cats -> cat 238 | * 239 | * feed -> feed agreed -> agree disabled -> disable 240 | * 241 | * matting -> mat mating -> mate meeting -> meet milling -> mill messing -> 242 | * mess 243 | * 244 | * meetings -> meet 245 | */ 246 | 247 | void r(String s) { 248 | if (m() > 0) setto(s); 249 | } 250 | 251 | /* step2() turns terminal y to i when there is another vowel in the stem. */ 252 | 253 | private void step1() { 254 | if (b[k] == 's') { 255 | if (ends("sses")) k -= 2; 256 | else if (ends("ies")) setto("i"); 257 | else if (b[k - 1] != 's') k--; 258 | } 259 | if (ends("eed")) { 260 | if (m() > 0) k--; 261 | } else if ((ends("ed") || ends("ing")) && vowelinstem()) { 262 | k = j; 263 | if (ends("at")) setto("ate"); 264 | else if (ends("bl")) setto("ble"); 265 | else if (ends("iz")) setto("ize"); 266 | else if (doublec(k)) { 267 | int ch = b[k--]; 268 | if (ch == 'l' || ch == 's' || ch == 'z') k++; 269 | } else if (m() == 1 && cvc(k)) setto("e"); 270 | } 271 | } 272 | 273 | /* 274 | * step3() maps double suffices to single ones. 
so -ization ( = -ize plus 275 | * -ation) maps to -ize etc. note that the string before the suffix must 276 | * give m() > 0. 277 | */ 278 | 279 | private void step2() { 280 | if (ends("y") && vowelinstem()) { 281 | b[k] = 'i'; 282 | dirty = true; 283 | } 284 | } 285 | 286 | /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ 287 | 288 | private void step3() { 289 | if (k == k0) return; /* For Bug 1 */ 290 | switch (b[k - 1]) { 291 | case 'a': 292 | if (ends("ational")) { 293 | r("ate"); 294 | break; 295 | } 296 | if (ends("tional")) { 297 | r("tion"); 298 | break; 299 | } 300 | break; 301 | case 'c': 302 | if (ends("enci")) { 303 | r("ence"); 304 | break; 305 | } 306 | if (ends("anci")) { 307 | r("ance"); 308 | break; 309 | } 310 | break; 311 | case 'e': 312 | if (ends("izer")) { 313 | r("ize"); 314 | break; 315 | } 316 | break; 317 | case 'l': 318 | if (ends("bli")) { 319 | r("ble"); 320 | break; 321 | } 322 | if (ends("alli")) { 323 | r("al"); 324 | break; 325 | } 326 | if (ends("entli")) { 327 | r("ent"); 328 | break; 329 | } 330 | if (ends("eli")) { 331 | r("e"); 332 | break; 333 | } 334 | if (ends("ousli")) { 335 | r("ous"); 336 | break; 337 | } 338 | break; 339 | case 'o': 340 | if (ends("ization")) { 341 | r("ize"); 342 | break; 343 | } 344 | if (ends("ation")) { 345 | r("ate"); 346 | break; 347 | } 348 | if (ends("ator")) { 349 | r("ate"); 350 | break; 351 | } 352 | break; 353 | case 's': 354 | if (ends("alism")) { 355 | r("al"); 356 | break; 357 | } 358 | if (ends("iveness")) { 359 | r("ive"); 360 | break; 361 | } 362 | if (ends("fulness")) { 363 | r("ful"); 364 | break; 365 | } 366 | if (ends("ousness")) { 367 | r("ous"); 368 | break; 369 | } 370 | break; 371 | case 't': 372 | if (ends("aliti")) { 373 | r("al"); 374 | break; 375 | } 376 | if (ends("iviti")) { 377 | r("ive"); 378 | break; 379 | } 380 | if (ends("biliti")) { 381 | r("ble"); 382 | break; 383 | } 384 | break; 385 | case 'g': 386 | if (ends("logi")) { 387 | r("log"); 388 | break; 389 | } 390 | } 391 | } 392 | 393 | /* step5() takes off -ant, -ence etc., in context vcvc. */ 394 | 395 | private void step4() { 396 | switch (b[k]) { 397 | case 'e': 398 | if (ends("icate")) { 399 | r("ic"); 400 | break; 401 | } 402 | if (ends("ative")) { 403 | r(""); 404 | break; 405 | } 406 | if (ends("alize")) { 407 | r("al"); 408 | break; 409 | } 410 | break; 411 | case 'i': 412 | if (ends("iciti")) { 413 | r("ic"); 414 | break; 415 | } 416 | break; 417 | case 'l': 418 | if (ends("ical")) { 419 | r("ic"); 420 | break; 421 | } 422 | if (ends("ful")) { 423 | r(""); 424 | break; 425 | } 426 | break; 427 | case 's': 428 | if (ends("ness")) { 429 | r(""); 430 | break; 431 | } 432 | break; 433 | } 434 | } 435 | 436 | /* step6() removes a final -e if m() > 1. */ 437 | 438 | private void step5() { 439 | if (k == k0) return; /* for Bug 1 */ 440 | switch (b[k - 1]) { 441 | case 'a': 442 | if (ends("al")) break; 443 | return; 444 | case 'c': 445 | if (ends("ance")) break; 446 | if (ends("ence")) break; 447 | return; 448 | case 'e': 449 | if (ends("er")) break; 450 | return; 451 | case 'i': 452 | if (ends("ic")) break; 453 | return; 454 | case 'l': 455 | if (ends("able")) break; 456 | if (ends("ible")) break; 457 | return; 458 | case 'n': 459 | if (ends("ant")) break; 460 | if (ends("ement")) break; 461 | if (ends("ment")) break; 462 | /* element etc. 
not stripped before the m */ 463 | if (ends("ent")) break; 464 | return; 465 | case 'o': 466 | if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; 467 | /* j >= 0 fixes Bug 2 */ 468 | if (ends("ou")) break; 469 | return; 470 | /* takes care of -ous */ 471 | case 's': 472 | if (ends("ism")) break; 473 | return; 474 | case 't': 475 | if (ends("ate")) break; 476 | if (ends("iti")) break; 477 | return; 478 | case 'u': 479 | if (ends("ous")) break; 480 | return; 481 | case 'v': 482 | if (ends("ive")) break; 483 | return; 484 | case 'z': 485 | if (ends("ize")) break; 486 | return; 487 | default: 488 | return; 489 | } 490 | if (m() > 1) k = j; 491 | } 492 | 493 | private void step6() { 494 | j = k; 495 | if (b[k] == 'e') { 496 | int a = m(); 497 | if (a > 1 || a == 1 && !cvc(k - 1)) k--; 498 | } 499 | if (b[k] == 'l' && doublec(k) && m() > 1) k--; 500 | } 501 | 502 | /** 503 | * Stem a word provided as a String. Returns the result as a String. 504 | */ 505 | public String stem(String s) { 506 | if (stem(s.toCharArray(), s.length())) return toString(); 507 | else return s; 508 | } 509 | 510 | /** 511 | * Stem a word contained in a char[]. Returns true if the stemming process 512 | * resulted in a word different from the input. You can retrieve the result 513 | * with getResultLength()/getResultBuffer() or toString(). 514 | */ 515 | public boolean stem(char[] word) { 516 | return stem(word, word.length); 517 | } 518 | 519 | /** 520 | * Stem a word contained in a portion of a char[] array. Returns true if the 521 | * stemming process resulted in a word different from the input. You can 522 | * retrieve the result with getResultLength()/getResultBuffer() or 523 | * toString(). 524 | */ 525 | public boolean stem(char[] wordBuffer, int offset, int wordLen) { 526 | reset(); 527 | if (b.length < wordLen) { 528 | b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)]; 529 | } 530 | System.arraycopy(wordBuffer, offset, b, 0, wordLen); 531 | i = wordLen; 532 | return stem(0); 533 | } 534 | 535 | /** 536 | * Stem a word contained in a leading portion of a char[] array. Returns 537 | * true if the stemming process resulted in a word different from the input. 538 | * You can retrieve the result with getResultLength()/getResultBuffer() or 539 | * toString(). 540 | */ 541 | public boolean stem(char[] word, int wordLen) { 542 | return stem(word, 0, wordLen); 543 | } 544 | 545 | /** 546 | * Stem the word placed into the Stemmer buffer through calls to add(). 547 | * Returns true if the stemming process resulted in a word different from 548 | * the input. You can retrieve the result with 549 | * getResultLength()/getResultBuffer() or toString(). 550 | */ 551 | public boolean stem() { 552 | return stem(0); 553 | } 554 | 555 | public boolean stem(int i0) { 556 | k = i - 1; 557 | k0 = i0; 558 | if (k > k0 + 1) { 559 | step1(); 560 | step2(); 561 | step3(); 562 | step4(); 563 | step5(); 564 | step6(); 565 | } 566 | // Also, a word is considered dirty if we lopped off letters 567 | // Thanks to Ifigenia Vairelles for pointing this out. 568 | if (i != k + 1) dirty = true; 569 | i = k + 1; 570 | return dirty; 571 | } 572 | 573 | } 574 | -------------------------------------------------------------------------------- /src/main/java/com/hankcs/lucene/SegmentWrapper.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

 * hankcs
 * me@hankcs.com
 * 2015/10/6 18:51
 *
 * Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/
 */
package com.hankcs.lucene;

import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;

import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Wraps a segmenter so that it emits one token at a time.
 *
 * @author hankcs
 */
public class SegmentWrapper {
    /**
     * buffer size
     */
    private static final int BUFFER_SIZE = 512;
    /**
     * sentence delimiters
     */
    private static final Set<Character> delimiterCharSet = new HashSet<Character>() {{
        add('\r');
        add('\n');
        add(';');
        add(';');
        add('。');
        add('!');
        add('!');
    }};
    /**
     * offset of the current term; since the wrapper reads the input line by line,
     * term.offset has to be corrected by this amount
     */
    int offset;
    /**
     * input
     */
    private Reader input;
    /**
     * segmenter
     */
    private Segment segment;
    /**
     * segmentation result
     */
    private Iterator<Term> iterator;
    /**
     * buffer
     */
    private char[] buffer = new char[BUFFER_SIZE];
    /**
     * length of the unprocessed tail of the buffer
     */
    private int remainSize = 0;

    public SegmentWrapper(Reader reader, Segment segment) {
        this.input = reader;
        this.segment = segment;
    }

    /**
     * Resets the segmenter.
     *
     * @param reader the new input
     */
    public void reset(Reader reader) {
        input = reader;
        offset = 0;
        iterator = null;
    }

    public Term next() throws IOException {
        if (iterator != null && iterator.hasNext()) return iterator.next();
        String line = readLine();
        if (line == null) return null;
        List<Term> termList = segment.seg(line);
        if (termList.size() == 0) return null;
        for (Term term : termList) {
            term.offset += offset;
        }
        offset += line.length();
        iterator = termList.iterator();
        return iterator.next();
    }

    private String readLine() throws IOException {
        int offset = 0;
        int length = BUFFER_SIZE;
        if (remainSize > 0) {
            offset = remainSize;
            length -= remainSize;
        }
        int n = input.read(buffer, offset, length);
        if (n < 0) {
            if (remainSize != 0) {
                String lastLine = new String(buffer, 0, remainSize);
                remainSize = 0;
                return lastLine;
            }
            return null;
        }
        n += offset;

        int eos = lastIndexOfEos(buffer, n);
        String line = new String(buffer, 0, eos);
        remainSize = n - eos;
        System.arraycopy(buffer, eos, buffer, 0, remainSize);
        return line;
    }

    private int lastIndexOfEos(char[] buffer, int length) {
        for (int i = length - 1; i > 0; i--) {
            if (delimiterCharSet.contains(buffer[i])) {
                return i + 1;
            }
        }
        return length;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/CloudApplication.java:
--------------------------------------------------------------------------------
package com.watt;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

@SpringBootApplication
public class CloudApplication {
    public static void main(String[] args) {
        SpringApplication.run(CloudApplication.class, args);
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/DbConfig.java:
--------------------------------------------------------------------------------
package com.watt.configure;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

@Configuration
@ConfigurationProperties(prefix = "db.mysql")
public class DbConfig {
    private String driverClass;
    private String jdbcUrl;
    private String user;
    private String password;

    public String getDriverClass() {
        return driverClass;
    }

    public void setDriverClass(String driverClass) {
        this.driverClass = driverClass;
    }

    public String getJdbcUrl() {
        return jdbcUrl;
    }

    public void setJdbcUrl(String jdbcUrl) {
        this.jdbcUrl = jdbcUrl;
    }

    public String getUser() {
        return user;
    }

    public void setUser(String user) {
        this.user = user;
    }

    public String getPassword() {
        return password;
    }

    public void setPassword(String password) {
        this.password = password;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/LuceneConfig.java:
--------------------------------------------------------------------------------
package com.watt.configure;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

import java.io.IOException;
import java.nio.file.FileSystems;

@Configuration
@ConfigurationProperties(prefix = "lucene")
public class LuceneConfig {
    private static Directory directory;
    private String root;
    private static DirectoryReader reader;
    private final Logger logger = LoggerFactory.getLogger(this.getClass());
    private String indexKey;
    private String vectorPath;
    private String tfidfPath;

    private Directory getDirectory() {
        try {
            directory = FSDirectory.open(FileSystems.getDefault().getPath(root));
        } catch (IOException e) {
            logger.error("failed to open the Lucene directory");
            return null;
        }
        return directory;
    }

    public IndexSearcher getIndexSearcher() {
        Directory directory = getDirectory();
        if (directory == null) {
            return null;
        }
        try {
            reader = DirectoryReader.open(directory);
            DirectoryReader tr = DirectoryReader.openIfChanged(reader);
            if (tr != null) {
                reader.close();
                reader = tr;
            }

            return new IndexSearcher(reader);
        } catch (IOException e) {
            logger.error("failed to open the IndexReader; cannot continue");
        }
        return null;
    }

    public String getRoot() {
        return root;
    }

    public void setRoot(String root) {
        this.root = root;
    }

    public String getIndexKey() {
        return indexKey;
    }

    public void setIndexKey(String indexKey) {
        this.indexKey = indexKey;
    }

    public String getVectorPath() {
        return vectorPath;
    }

    public void setVectorPath(String vectorPath) {
        this.vectorPath = vectorPath;
    }

    public String getTfidfPath() {
        return tfidfPath;
    }

    public void setTfidfPath(String tfidfPath) {
        this.tfidfPath = tfidfPath;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/configure/MybatisConfig.java:
--------------------------------------------------------------------------------
package com.watt.configure;

import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;

@Configuration
@ConfigurationProperties(prefix = "db.mybatis")
public class MybatisConfig {
    private String mybatisXml;

    public String getMybatisXml() {
        return mybatisXml;
    }

    public void setMybatisXml(String mybatisXml) {
        this.mybatisXml = mybatisXml;
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/QuestionsIndex.java:
--------------------------------------------------------------------------------
package com.watt.core;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.synonym.Synonym;
import com.hankcs.hanlp.dictionary.CoreSynonymDictionary;
import com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.lucene.HanLPIndexAnalyzer;
import com.watt.configure.LuceneConfig;
import com.watt.mvc.service.QAService;
import com.watt.util.FileUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.nio.file.FileSystems;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Creation and storage of the Lucene index.
 */
@Component
public class QuestionsIndex {
    private final Logger logger = LoggerFactory.getLogger(this.getClass());
    private final QAService qaService;
    private final LuceneConfig luceneConfig;
    private Segment segment = HanLP.newSegment();

    @Autowired
    public QuestionsIndex(QAService qaService, LuceneConfig luceneConfig) {
        this.qaService = qaService;
        this.luceneConfig = luceneConfig;
    }

    /**
     * Initializes (rebuilds) the index from the question database.
     */
    public void createIndex() {

        FileUtils.clearPath(luceneConfig.getRoot());
        IndexWriter writer;
        Directory directory;
        IndexWriterConfig iwc = new IndexWriterConfig(new HanLPIndexAnalyzer());
        // open the index directory
        try {
            directory = FSDirectory.open(FileSystems.getDefault().getPath(luceneConfig.getRoot()));
            writer = new IndexWriter(directory, iwc);
        } catch (IOException e) {
            logger.info("failed to open the Lucene directory");
            return;
        }

        int start = 0;
        int pageSize = 1000;
        List<Map<String, String>> words;
        AtomicInteger count = new AtomicInteger();
        do {
            words = qaService.queryQuestions(start, pageSize);
            for (Map<String, String> word : words) {
                if (word != null && !word.isEmpty()) {
                    Document doc = new Document(); // create a Document and add Fields (a Field is a child element of a Document)
                    doc.add(new Field("questions", word.get("QUESTION"), TextField.TYPE_STORED));
                    doc.add(new Field(luceneConfig.getIndexKey(), addSynonymItems(word.get("QUESTION")), TextField.TYPE_STORED));
                    doc.add(new Field("key", word.get("KW_ID"), TextField.TYPE_STORED));
                    doc.add(new Field("questionID", word.get("QUESTION_ID"), TextField.TYPE_STORED));
                    try {
                        writer.addDocument(doc);
                    } catch (IOException e) {
                        logger.info("failed to index QUESTION:" + word.get("QUESTION") + "\nID:" + word.get("KW_ID"));
                    }
                    count.getAndIncrement();
                }
            }
            logger.info("questions indexed so far:" + count.doubleValue());
            start += pageSize;
        } while (!words.isEmpty());
        try {
            writer.close();
            directory.close();
        } catch (IOException e) {
            logger.info("failed to close the Lucene directory and writer");
        }

    }
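    /**
     * Appends the EQUAL-type synonyms of every term to the question text, so a
     * query phrased with a synonym can still match the indexed question. For
     * example (hypothetical words): if "发票" has the equal synonym "票据", the
     * indexed text contains both, and a search for "票据" hits the question.
     */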
    private String addSynonymItems(String word) {
        List<Term> termList = segment.seg(word);
        StringBuffer result = new StringBuffer();
        for (Term term : termList) {
            result.append(term.word);
            CommonSynonymDictionary.SynonymItem item = CoreSynonymDictionary.get(term.word);
            if (item != null && item.type == Synonym.Type.EQUAL && item.synonymList != null && !item.synonymList.isEmpty()) {
                for (Synonym synonym : item.synonymList) {
                    result.append(synonym.realWord);
                }
            }
        }
        return result.toString();
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/dictionary/CoreAbbreviationDictionary.java:
--------------------------------------------------------------------------------
package com.watt.core.dictionary;

import com.hankcs.hanlp.seg.common.Term;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Core abbreviation dictionary: maps abbreviations to their full names.
 */
public class CoreAbbreviationDictionary {
    private static Map<String, String> abbreviation = new HashMap<>();

    public static String getAbbreviation(String abbr) {
        return abbreviation.get(abbr);
    }

    public static void addAbbreviation(String abbr, String full) {
        abbreviation.put(abbr, full);
    }

    /**
     * Replaces every abbreviation with its full name.
     */
    public static List<Term> convertAbbreviationToFull(List<Term> terms) {
        terms.forEach(term -> {
            String full = getAbbreviation(term.word);
            if (full != null) {
                term.word = full;
            }
        });
        return terms;
    }
}
--------------------------------------------------------------------------------
/src/main/java/com/watt/core/dictionary/CoreStopWordsDictionary.java:
--------------------------------------------------------------------------------
package com.watt.core.dictionary;

import com.hankcs.hanlp.seg.common.Term;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class CoreStopWordsDictionary {
    private static Set<String> stopWords = new HashSet<>();

    public static void addStopWord(String word) {
        stopWords.add(word);
    }

    private static boolean contains(String word) {
        return stopWords.contains(word);
    }

    /**
     * Removes all stop words.
     *
     * @return the original terms, minus the stop words
     */
    public static List<Term> removeStopWords(List<Term> terms) {
        List<Term> result = new ArrayList<>();
        terms.forEach(term -> {
            if (!contains(term.word)) {
                result.add(term);
            }
        });
        return result;
    }

}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/dictionary/MyCustomDictionary.java:
--------------------------------------------------------------------------------
package com.watt.core.dictionary;

import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.watt.mvc.service.QAService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.util.List;
import java.util.Map;

/**
 * Loading and maintenance of the custom dictionaries.
 */
@Component
public class MyCustomDictionary {
    private final Logger logger = LoggerFactory.getLogger(this.getClass());
    private final QAService qaService;

    @Autowired
    public MyCustomDictionary(QAService qaService) {
        this.qaService = qaService;
    }

    public void initDictionary() {
        List<Map<String, String>> words = qaService.queryDictionaryAll();
        words.forEach(word -> {
            if (word != null && !word.isEmpty()) {
                CustomDictionary.add(word.get("WORD"));
            }
        });
        logger.info("custom dictionary entries loaded:" + words.size());
    }

    public void initCiLinSynonyms() {
        List<String> synonyms = qaService.querySynonyms();
        //CoreSynonymDictionary.reload(synonyms);
        logger.info("synonym entries loaded:" + synonyms.size());
    }

    public void initStopWords() {
        List<String> stopWords = qaService.queryStopWordsAll();
        stopWords.forEach(CoreStopWordsDictionary::addStopWord);
        logger.info("stop words loaded:" + stopWords.size());
    }

    /**
     * Initializes the abbreviation dictionary and registers every abbreviation
     * as a custom word.
     */
    public void initAbbreviation() {
        List<Map<String, String>> abbreviations = qaService.queryAbbreviation(null);
        abbreviations.forEach(abbreviation -> {
            CustomDictionary.add(abbreviation.get("abbr_name"));
            CoreAbbreviationDictionary.addAbbreviation(abbreviation.get("abbr_name"), abbreviation.get("full_name"));
        });
        logger.info("abbreviation entries loaded:" + abbreviations.size());
    }

    /**
     * Adds a tax-domain term to the dictionary; the addition is persisted.
     *
     * @param word the term (or corpus entry) to add
     */
    public void addTaxDictionaryWord(String word) {
        qaService.addTaxDictionaryWord(word);
        CustomDictionary.add(word);
    }
}

--------------------------------------------------------------------------------
/src/main/java/com/watt/core/nlp/cosinesimlarity/AtomSegment.java:
--------------------------------------------------------------------------------
package com.watt.core.nlp.cosinesimlarity;

import java.util.HashMap;
import java.util.Map;

public class AtomSegment {
    public AtomSegment() {
    }

    public static String atomSegment(String sentence) {
        String atomSegResult = "";
        Map<Integer, String> wsWordMap = IDExtract.getLetters(sentence);
        Map<Integer, String> mWordMap = IDExtract.getNumbers(sentence);
        Map<Integer, String> wordsMap = new HashMap<>();
        wordsMap.putAll(wsWordMap);
        wordsMap.putAll(mWordMap);
        int senLength = sentence.length();

        for (int i = 0; i < senLength; ++i) {
            String word_i = wordsMap.get(i);
            if (word_i == null) {
                word_i = sentence.charAt(i) + "";
                wordsMap.put(i, word_i);
            } else {
                i += word_i.length() - 1;
            }

            atomSegResult = atomSegResult + " " + word_i;
        }

        return atomSegResult;
    }
}
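// Illustrative example: atomSegment("ID123测试") returns " ID 123 测 试".
// Runs of letters and runs of digits are kept whole, every other character
// becomes its own token, and each token is preceded by a single space.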
-------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/IDExtract.java: -------------------------------------------------------------------------------- 1 | // 2 | // Source code recreated from a .class file by IntelliJ IDEA 3 | // (powered by Fernflower decompiler) 4 | // 5 | 6 | package com.watt.core.nlp.cosinesimlarity; 7 | 8 | import java.util.ArrayList; 9 | import java.util.Map; 10 | import java.util.TreeMap; 11 | import java.util.regex.Matcher; 12 | import java.util.regex.Pattern; 13 | 14 | public class IDExtract { 15 | public IDExtract() { 16 | } 17 | 18 | private static Map getStr(String regex_code, String param) { 19 | if (regex_code != null && !"".equals(regex_code)) { 20 | if (param != null && !"".equals(param)) { 21 | new ArrayList(); 22 | Map map = new TreeMap(); 23 | Pattern p = Pattern.compile(regex_code); 24 | Matcher m = p.matcher(param); 25 | 26 | while(m.find()) { 27 | map.put(m.start(), m.group()); 28 | } 29 | 30 | return map; 31 | } else { 32 | return null; 33 | } 34 | } else { 35 | return null; 36 | } 37 | } 38 | 39 | public static Map getLetters(String param) { 40 | String regex_code = "[A-Za-z]+"; 41 | return getStr(regex_code, param); 42 | } 43 | 44 | public static Map getNumbers(String param) { 45 | String regex_code = "(\\+|\\-)?\\d+(\\.\\d+)?"; 46 | return getStr(regex_code, param); 47 | } 48 | 49 | public static Map getEmail(String param) { 50 | String regex_code = "([a-zA-Z_]{1,}[0-9]{0,}@(([a-zA-z0-9]-*){1,}\\.){1,3}[a-zA-z\\-]{1,})|([1-9]\\d{4,10}@qq.com)"; 51 | return getStr(regex_code, param); 52 | } 53 | 54 | public static Map getMobile(String param) { 55 | String regex_code = "(? getTelNumber(String param) { 60 | String regex_code = "((\\(0[1-9][0-9]{1,2}\\))|((? getIDCard(String param) { 65 | String regex_code = "(\\d{6})(18|19|20)?((\\d{2})|(\\*\\*)|(XX)|(xx)|(\\?\\?)|(\\?\\?))(([01]\\d)|(\\*\\*)|(XX)|(xx)|(\\?\\?)|(\\?\\?))(([0123]\\d)|(\\*\\*)|(XX)|(xx)|(\\?\\?)|(\\?\\?))(((\\d{3})(\\d|X|x)?)|(\\*\\*\\*\\*)|(XXXX)|(xxxx)|(\\?\\?\\?\\?)|(\\?\\?\\?\\?))"; 66 | return getStr(regex_code, param); 67 | } 68 | 69 | public static Map getIPAddr(String param) { 70 | String regex_code = "(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"; 71 | return getStr(regex_code, param); 72 | } 73 | 74 | public static Map getQQ(String param) { 75 | String regex_code = "[1-9]\\d{4,10}"; 76 | return getStr(regex_code, param); 77 | } 78 | 79 | public static Map getTime(String param) { 80 | String regex_code1 = "((\\d{4}-\\d{1,2}-\\d{1,2})|(\\d{2,4}\\u5E74\\d{1,2}\\u6708\\d{1,2}\\u65E5))"; 81 | String regex_code2 = "((\\d{2}:\\d{2}:\\d{2})|(((\\d{1,2}\\u65F6)|(\\d{1,2}\\u70b9))(\\d{1,2}\\u5206(\\d{1,2}\\u79D2)?)?))"; 82 | String regex_code3 = "((\\d{4}-\\d{1,2}-\\d{1,2})|(\\d{2,4}\\u5E74\\d{1,2}\\u6708\\d{1,2}\\u65E5)|(\\d{2,4}\\u5E74\\d{1,2}\\u6708)|\\d{1,2}\\u6708\\d{1,2}\\u65E5|\\d{2,4}\\u5E74|\\d{1,2}\\u6708|\\d{1,2}\\u65E5)"; 83 | String regex_code4 = "(\\d{2}:\\d{2}:\\d{2})|(\\d{2}:\\d{2})|(((\\d{1,2}\\u65F6)|(\\d{1,2}\\u70b9))\\d{1,2}\\u5206(\\d{1,2}\\u79D2)?)|(\\d{1,2}\\u65F6)"; 84 | String regex_code = regex_code1 + "(\\s)?" 
+ regex_code2 + "|" + regex_code3 + "|" + regex_code4; 85 | return getStr(regex_code, param); 86 | } 87 | 88 | public static Map getCNTime(String param) { 89 | String regex_code1 = "(([零○一二两三四五六七八九十百千万亿]{2,}\\u5E74([一二三四五六七八九十]|(十(一|二)))\\u6708([一二三四五六七八九十]{1,})\\u65E5)((\\s)?((([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9))[零○一二两三四五六七八九十]{1,}\\u5206[零○一二两三四五六七八九十]{1,}\\u79D2))*)"; 90 | String regex_code2 = "(([零○一二两三四五六七八九十百千万亿]{2,}\\u5E74([一二三四五六七八九十]|(十(一|二)))\\u6708([一二三四五六七八九十]{1,})\\u65E5)*((\\s)?((([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9))[零○一二两三四五六七八九十]{1,}\\u5206[零○一二两三四五六七八九十]{1,}\\u79D2)))"; 91 | String regex_code3 = "((([零○一二两三四五六七八九十百千万亿]{2,}\\u5E74([一二三四五六七八九十]|(十(一|二)))\\u6708)|(([一二三四五六七八九十]|(十(一|二)))\\u6708([一二三四五六七八九十]{1,})\\u65E5)|([(零|○)一(二|两)三四五六七八九十百千万亿]{2,}\\u5E74)|(([一二三四五六七八九十]|(十(一|二)))\\u6708)|(([一二三四五六七八九十]{1,})\\u65E5)))"; 92 | String regex_code4 = "((([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9))[零○一二两三四五六七八九十]{1,}\\u5206|(([零○一二两三四五六七八九十]{1,}\\u65F6)|([零○一二两三四五六七八九十]{1,}\\u70b9)))"; 93 | String regex_code = regex_code1 + "|" + regex_code2 + "|" + regex_code3 + "|" + regex_code4; 94 | return getStr(regex_code, param); 95 | } 96 | 97 | public static Map getURL(String param) { 98 | String regex_code = "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"; 99 | return getStr(regex_code, param); 100 | } 101 | 102 | public static Map getCarNum(String param) { 103 | String regex_code = "[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领A-Z]{1}[A-Z]{1}[A-Z0-9]{4}[A-Z0-9挂学警港澳]{1}"; 104 | return getStr(regex_code, param); 105 | } 106 | 107 | public static Map getBankCard(String param) { 108 | String regex_code = "(\\d{16}|\\d{19})"; 109 | return getStr(regex_code, param); 110 | } 111 | 112 | public static Map getCNNum(String param) { 113 | String regex_code = "(第[零○一二两三四五六七八九十廿卅百千万亿]{1,})|((几|数)(十|百|千|万|(十万)|(百万)|(千万)|亿))|((成|上)(百|千|万|(十万)|(百万)|(千万)|亿))|([零○一二两三四五六七八九十廿卅百千万亿]{2,})|([零壹贰叁肆伍陆柒捌玖拾佰仟万亿]{2,})"; 114 | return getStr(regex_code, param); 115 | } 116 | 117 | public static Map getPerNum(String param) { 118 | String regex_code1 = "((\\d{1,})(\\.\\d{1,})?%|(百分之(([○零一二两三四五六七八九十廿卅百]{1,})(点[○零一二两三四五六七八九十廿卅]{1,})?)|((\\d{1,})(\\.\\d{1,})?)))"; 119 | String regex_code2 = "((\\d{1,})(\\.\\d{1,})?‰|(千分之(([○零一二两三四五六七八九十廿卅百千]{1,})(点[○零一二两三四五六七八九十廿卅]{1,})?))|((\\d{1,})(\\.\\d{1,})?))"; 120 | String regex_code3 = "((([○零一二两三四五六七八九十廿卅百千万亿]{1,})|(\\d{1,}))分之(([○零一二两三四五六七八九十廿卅百千万亿]{1,})|(\\d{1,})))"; 121 | String regex_code = regex_code3 + "|" + regex_code1 + "|" + regex_code2; 122 | return getStr(regex_code, param); 123 | } 124 | 125 | public static Map getFloatNum(String param) { 126 | String regex_code = "([1-9]\\d*\\.\\d*|0\\.\\d*[1-9]\\d*)"; 127 | return getStr(regex_code, param); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/SimilarityAnalyze.java: -------------------------------------------------------------------------------- 1 | // 2 | // Source code recreated from a .class file by IntelliJ IDEA 3 | // (powered by Fernflower decompiler) 4 | // 5 | 6 | package com.watt.core.nlp.cosinesimlarity; 7 | 8 | import com.hankcs.hanlp.seg.common.Term; 9 | 10 | import java.io.IOException; 11 | import java.util.List; 12 | 13 | public abstract class SimilarityAnalyze { 14 | Word2Vec vec = new Word2Vec(); 15 | boolean loadModel; 16 | 17 | public Word2Vec getVec() { 18 | return vec; 19 | } 20 | 21 | public void setVec(Word2Vec vec) { 22 | 
this.vec = vec;
23 | }
24 | 
25 | public void loadGoogleModel(String filePath) {
26 | try {
27 | this.vec.loadGoogleModel(filePath);
28 | } catch (IOException e) {
29 | e.printStackTrace();
30 | }
31 | this.loadModel = true;
32 | }
33 | 
34 | public void loadCommonModel(String filePath) {
35 | try {
36 | this.vec.loadCommonModel(filePath);
37 | } catch (IOException e) {
38 | e.printStackTrace();
39 | }
40 | this.loadModel = true;
41 | }
42 | 
43 | public void loadJavaModel(String filePath) {
44 | try {
45 | this.vec.loadJavaModel(filePath);
46 | } catch (IOException e) {
47 | e.printStackTrace();
48 | }
49 | this.loadModel = true;
50 | }
51 | 
52 | float[] getWordVector(String word) {
53 | return !this.loadModel ? null : this.vec.getWordVector(word);
54 | }
55 | 
56 | /**
57 |  * Computes the cosine similarity between two vectors.
58 |  *
59 |  * @param vec1 first vector
60 |  * @param vec2 second vector
61 |  * @return similarity score in [0, 1]
62 |  */
63 | double calCosine(float[] vec1, float[] vec2) {
64 | double dist = 0.0;
65 | double sum1 = 0.0;
66 | double sum2 = 0.0;
67 | if (vec1.length != vec2.length) {
68 | return dist;
69 | }
70 | for (int i = 0; i < vec1.length; ++i) {
71 | dist += vec1[i] * vec2[i];
72 | sum1 += Math.pow(vec1[i], 2);
73 | sum2 += Math.pow(vec2[i], 2);
74 | }
75 | double result = dist / Math.sqrt(sum1 * sum2);
76 | // Floating-point rounding can push the score of (near-)identical vectors
77 | // slightly above 1; clamp it so downstream scores stay within 100%.
78 | return result > 1.0 ? 1.0 : result;
79 | }
80 | 
81 | double calMaxSimilarity(String centerWord, List<Term> wordList) {
82 | double max = -1.0;
83 | for (Term term : wordList) {
84 | if (term.word.equals(centerWord)) {
85 | return 1.0;
86 | }
87 | }
88 | for (Term term : wordList) {
89 | double temp = this.wordSimilarity(centerWord, term.word);
90 | if (temp != 0.0 && temp > max) {
91 | max = temp;
92 | }
93 | }
94 | return max == -1.0 ? 0.0 : max;
95 | }
96 | 
97 | public abstract double wordSimilarity(String word1, String word2);
98 | 
99 | public abstract double sentenceSimilarity(List<Term> sentence1Words, List<Term> sentence2Words);
100 | 
101 | }
102 | 
-------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/SimilarityAnalyzeUnfamiliarWords.java: --------------------------------------------------------------------------------
1 | package com.watt.core.nlp.cosinesimlarity;
2 | 
3 | import com.hankcs.hanlp.seg.common.Term;
4 | 
5 | import java.io.IOException;
6 | import java.util.List;
7 | 
8 | /**
9 |  * Similarity analyzer that falls back to character vectors for words missing
10 |  * from the word-level model.
11 |  */
12 | public class SimilarityAnalyzeUnfamiliarWords extends SimilarityAnalyze {
13 | private Word2Vec charVec;
14 | private int dimension = 200;
15 | 
16 | public SimilarityAnalyzeUnfamiliarWords() {
17 | this.vec = new Word2Vec();
18 | this.charVec = new Word2Vec();
19 | this.loadModel = false;
20 | }
21 | 
22 | public void loadCharJavaModel(String modelPath) {
23 | try {
24 | this.charVec.loadCommonModel(modelPath);
25 | } catch (IOException e) {
26 | e.printStackTrace();
27 | }
28 | this.loadModel = true;
29 | }
30 | 
31 | public double wordSimilarity(String word1, String word2) {
32 | if (!this.loadModel) {
33 | return 0.0;
34 | }
35 | float[] word1Vec = this.getWordVector(word1);
36 | float[] word2Vec = this.getWordVector(word2);
37 | // fall back to averaged character vectors for out-of-vocabulary words
38 | if (word1Vec == null) {
39 | word1Vec = this.getNullVec(word1);
40 | }
41 | if (word2Vec == null) {
42 | word2Vec = this.getNullVec(word2);
43 | }
44 | return this.calCosine(word1Vec, word2Vec);
45 | }
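46 | // Worked example (illustrative note, not part of the original source): for
47 | // vec1 = (1, 0) and vec2 = (1, 1), calCosine returns
48 | // (1*1 + 0*1) / sqrt((1 + 0) * (1 + 1)) = 1 / sqrt(2) ≈ 0.707, so words
49 | // sharing about half of their vector direction score roughly 0.7, while an
50 | // exact in-vocabulary match short-circuits to 1.0 in calMaxSimilarity.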
51 | 
52 | private float[] getCharVector(String word) {
53 | return !this.loadModel ? null : this.charVec.getWordVector(word);
54 | }
55 | 
56 | /**
57 |  * Builds a stand-in vector for an out-of-vocabulary word by averaging the
58 |  * character vectors of its atoms (see AtomSegment).
59 |  */
60 | private float[] getNullVec(String word) {
61 | float[] nullVec = new float[this.dimension];
62 | int count = 0;
63 | String atomSegment = AtomSegment.atomSegment(word).trim();
64 | String[] atomSegmentStr = atomSegment.split("\\s+");
65 | for (int i = 0; i < atomSegmentStr.length; ++i) {
66 | float[] charVector = this.getCharVector(atomSegmentStr[i]);
67 | if (charVector != null) {
68 | ++count;
69 | for (int j = 0; j < this.dimension; ++j) {
70 | nullVec[j] += charVector[j];
71 | }
72 | }
73 | }
74 | // with zero or one known atoms the sum is already the answer; otherwise average
75 | if (count > 1) {
76 | for (int i = 0; i < this.dimension; ++i) {
77 | nullVec[i] /= (float) count;
78 | }
79 | }
80 | return nullVec;
81 | }
82 | 
83 | /**
84 |  * Sentence similarity.
85 |  *
86 |  * @param sentence1Words segmented text one
87 |  * @param sentence2Words segmented text two
88 |  * @return similarity score (a probability between 0 and 1)
89 |  */
90 | public double sentenceSimilarity(List<Term> sentence1Words, List<Term> sentence2Words) {
91 | if (!this.loadModel) {
92 | return 0.0;
93 | } else if (!sentence1Words.isEmpty() && !sentence2Words.isEmpty()) {
94 | float sum1 = 0.0F;
95 | float sum2 = 0.0F;
96 | int count1 = 0;
97 | int count2 = 0;
98 | // for every word of sentence 1, score its best match within sentence 2
99 | for (Term sentence1Word : sentence1Words) {
100 | ++count1;
101 | sum1 += this.calMaxSimilarity(sentence1Word.word, sentence2Words);
102 | }
103 | // and vice versa for every word of sentence 2
104 | for (Term sentence2Word : sentence2Words) {
105 | ++count2;
106 | sum2 += this.calMaxSimilarity(sentence2Word.word, sentence1Words);
107 | }
108 | // guard against zero counts so the divisions cannot produce NaN
109 | if (count1 == 0) {
110 | return count2 == 0 ? 0.0 : sum2 / count2;
111 | } else if (count2 == 0) {
112 | return sum1 / count1;
113 | }
114 | // take the smaller of the two directed scores; this keeps a short text that
115 | // is merely contained in a longer one from scoring as a full match
116 | return Math.min(sum1 / count1, sum2 / count2);
117 | } else {
118 | return 0.0;
119 | }
120 | }
121 | 
122 | }
123 | 
-------------------------------------------------------------------------------- /src/main/java/com/watt/core/nlp/cosinesimlarity/Word2Vec.java: --------------------------------------------------------------------------------
1 | package com.watt.core.nlp.cosinesimlarity;
2 | 
3 | import java.io.*;
4 | import java.nio.charset.StandardCharsets;
5 | import java.util.HashMap;
6 | 
7 | public class Word2Vec {
8 | private static final int MAX_SIZE = 50;
9 | private HashMap<String, float[]> wordMap = new HashMap<>();
10 | private int words;
11 | private int size;
12 | 
13 | private static float readFloat(InputStream is) throws IOException {
14 | byte[] bytes = new byte[4];
15 | is.read(bytes);
16 | return getFloat(bytes);
17 | }
18 | 
19 | /**
20 |  * Decodes four little-endian bytes into one float.
21 |  */
22 | private static float getFloat(byte[] b) {
23 | int accum = 0;
24 | accum |= (b[0] & 0xff) << 0;
25 | accum |= (b[1] & 0xff) << 8;
26 | accum |= (b[2] & 0xff) << 16;
27 | accum |= (b[3] & 0xff) << 24;
28 | return Float.intBitsToFloat(accum);
29 | }
30 | 
31 | /**
32 |  * Reads one whitespace-terminated token from the stream.
33 |  */
34 | private static String readString(DataInputStream dis) throws IOException {
35 | byte[] bytes = new byte[MAX_SIZE];
36 | byte b = dis.readByte();
37 | int i = -1;
38 | StringBuilder sb = new StringBuilder();
39 | // 32 = ' ' and 10 = '\n' delimit tokens in the word2vec binary format
40 | while (b != 32 && b != 10) {
41 | i++;
42 | bytes[i] = b;
43 | b = dis.readByte();
44 | if (i == 49) {
45 | sb.append(new String(bytes, StandardCharsets.UTF_8));
46 | i = -1;
47 | bytes = new byte[MAX_SIZE];
48 | }
49 | }
50 | String s = new String(bytes, 0, i + 1, StandardCharsets.UTF_8);
51 | sb.append(s);
52 | return 
sb.toString(); 53 | } 54 | 55 | /** 56 | * 加载模型 57 | * 58 | * @param path 模型的路径 59 | */ 60 | void loadGoogleModel(String path) throws IOException { 61 | DataInputStream dis = null; 62 | BufferedInputStream bis = null; 63 | double len = 0; 64 | float vector = 0; 65 | bis = new BufferedInputStream(new FileInputStream(path)); 66 | dis = new DataInputStream(bis); 67 | // //读取词数 68 | words = Integer.parseInt(readString(dis)); 69 | // //大小 70 | size = Integer.parseInt(readString(dis)); 71 | String word; 72 | float[] vectors = null; 73 | for (int i = 0; i < words; i++) { 74 | word = readString(dis); 75 | vectors = new float[size]; 76 | len = 0; 77 | for (int j = 0; j < size; j++) { 78 | vector = readFloat(dis); 79 | len += vector * vector; 80 | vectors[j] = (float) vector; 81 | } 82 | len = Math.sqrt(len); 83 | 84 | for (int j = 0; j < size; j++) { 85 | vectors[j] /= len; 86 | } 87 | 88 | wordMap.put(word, vectors); 89 | dis.read(); 90 | } 91 | bis.close(); 92 | dis.close(); 93 | } 94 | 95 | /** 96 | * 加载模型 97 | * 98 | * @param path 模型的路径 99 | * @throws IOException 文件找不到时会抛出异常 100 | */ 101 | void loadCommonModel(String path) throws IOException { 102 | BufferedReader reader = new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(path)))); 103 | String wordLine = null; 104 | //将第一行的文本省略掉,第一行分别是词数量和纬度,是不需要记录到加载内容中的 105 | reader.readLine(); 106 | while ((wordLine = reader.readLine()) != null) { 107 | String[] split = wordLine.trim().split("\\s+"); 108 | String key = ""; 109 | float[] value = new float[split.length - 1]; 110 | for (int i = 0; i < split.length; i++) { 111 | if (i == 0) { 112 | key = split[0]; 113 | } else { 114 | value[i - 1] = Float.parseFloat(split[i]); 115 | } 116 | } 117 | wordMap.put(key, value); 118 | } 119 | } 120 | 121 | void loadJavaModel(String path) throws IOException { 122 | try (DataInputStream dis = new DataInputStream(new BufferedInputStream(new FileInputStream(path)))) { 123 | words = dis.readInt(); 124 | size = dis.readInt(); 125 | 126 | float vector = 0; 127 | 128 | String key = null; 129 | float[] value = null; 130 | for (int i = 0; i < words; i++) { 131 | double len = 0; 132 | key = dis.readUTF(); 133 | value = new float[size]; 134 | for (int j = 0; j < size; j++) { 135 | vector = dis.readFloat(); 136 | len += vector * vector; 137 | value[j] = vector; 138 | } 139 | 140 | len = Math.sqrt(len); 141 | 142 | for (int j = 0; j < size; j++) { 143 | value[j] /= len; 144 | } 145 | wordMap.put(key, value); 146 | } 147 | 148 | } 149 | } 150 | 151 | private float[] sum(float[] center, float[] fs) { 152 | 153 | if (center == null && fs == null) { 154 | return null; 155 | } 156 | 157 | if (fs == null) { 158 | return center; 159 | } 160 | 161 | if (center == null) { 162 | return fs; 163 | } 164 | 165 | for (int i = 0; i < fs.length; i++) { 166 | center[i] += fs[i]; 167 | } 168 | 169 | return center; 170 | } 171 | 172 | /** 173 | * 得到词向量 174 | */ 175 | public float[] getWordVector(String word) { 176 | return wordMap.get(word); 177 | } 178 | 179 | /** 180 | * 设置词向量 181 | */ 182 | public void setWordVector(String word, float[] value) { 183 | wordMap.put(word, value); 184 | } 185 | 186 | public HashMap getWordMap() { 187 | return wordMap; 188 | } 189 | 190 | public int getSize() { 191 | return size; 192 | } 193 | 194 | } 195 | -------------------------------------------------------------------------------- /src/main/java/com/watt/data/jdbc/MySqlDataSource.java: -------------------------------------------------------------------------------- 1 | package 
com.watt.data.jdbc; 2 | 3 | import com.mchange.v2.c3p0.ComboPooledDataSource; 4 | import com.watt.configure.DbConfig; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.context.annotation.Bean; 9 | import org.springframework.context.annotation.Configuration; 10 | 11 | import java.beans.PropertyVetoException; 12 | 13 | @Configuration 14 | public class MySqlDataSource { 15 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 16 | private DbConfig dbConfig; 17 | 18 | @Autowired 19 | public MySqlDataSource(DbConfig dbConfig) { 20 | logger.info("MySqlDataSource OK"); 21 | this.dbConfig = dbConfig; 22 | } 23 | 24 | @Bean 25 | public ComboPooledDataSource getDataSource() { 26 | ComboPooledDataSource dataSource = new ComboPooledDataSource(); 27 | try { 28 | dataSource.setDriverClass(dbConfig.getDriverClass()); 29 | dataSource.setJdbcUrl(dbConfig.getJdbcUrl()); 30 | dataSource.setUser(dbConfig.getUser()); 31 | dataSource.setPassword(dbConfig.getPassword()); 32 | dataSource.setMinPoolSize(1); 33 | dataSource.setMaxPoolSize(2); 34 | dataSource.setInitialPoolSize(1); 35 | dataSource.setMaxIdleTime(180); 36 | dataSource.setAcquireRetryAttempts(30); 37 | return dataSource; 38 | } catch (PropertyVetoException e) { 39 | e.printStackTrace(); 40 | } 41 | return null; 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/main/java/com/watt/data/jdbc/MySqlSessionFactoryBean.java: -------------------------------------------------------------------------------- 1 | package com.watt.data.jdbc; 2 | 3 | import com.watt.configure.MybatisConfig; 4 | import org.mybatis.spring.SqlSessionFactoryBean; 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | import org.springframework.beans.factory.annotation.Autowired; 8 | import org.springframework.context.annotation.Bean; 9 | import org.springframework.context.annotation.Configuration; 10 | import org.springframework.core.io.ClassPathResource; 11 | 12 | @Configuration 13 | public class MySqlSessionFactoryBean { 14 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 15 | private MySqlDataSource dataSource; 16 | private MybatisConfig mybatisConfig; 17 | 18 | @Autowired 19 | public MySqlSessionFactoryBean(MySqlDataSource dataSource, MybatisConfig mybatisConfig) { 20 | logger.info("MySqlSessionFactoryBean OK"); 21 | this.dataSource = dataSource; 22 | this.mybatisConfig = mybatisConfig; 23 | } 24 | 25 | @Bean 26 | public SqlSessionFactoryBean getSqlSessionFactoryBean() { 27 | SqlSessionFactoryBean sqlSessionFactoryBean = new SqlSessionFactoryBean(); 28 | sqlSessionFactoryBean.setDataSource(dataSource.getDataSource()); 29 | sqlSessionFactoryBean.setConfigLocation(new ClassPathResource(mybatisConfig.getMybatisXml())); 30 | return sqlSessionFactoryBean; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/com/watt/data/jdbc/MySqlSessionTemplate.java: -------------------------------------------------------------------------------- 1 | package com.watt.data.jdbc; 2 | 3 | import org.mybatis.spring.SqlSessionTemplate; 4 | import org.slf4j.Logger; 5 | import org.slf4j.LoggerFactory; 6 | import org.springframework.beans.factory.annotation.Autowired; 7 | import org.springframework.context.annotation.Bean; 8 | import org.springframework.context.annotation.Configuration; 9 | 10 | @Configuration 11 | 
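// Exposes one shared SqlSessionTemplate over the pooled data source; the static
// field below caches it so repeated bean lookups reuse the same instance.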
public class MySqlSessionTemplate { 12 | private static SqlSessionTemplate sqlSessionTemplate; 13 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 14 | 15 | private MySqlSessionFactoryBean mySqlSessionFactoryBean; 16 | 17 | @Autowired 18 | public MySqlSessionTemplate(MySqlSessionFactoryBean mySqlSessionFactoryBean) { 19 | logger.info("MySqlSessionTemplate OK"); 20 | this.mySqlSessionFactoryBean = mySqlSessionFactoryBean; 21 | } 22 | 23 | @Bean 24 | public SqlSessionTemplate getSqlSessionTemplate() { 25 | try { 26 | if (sqlSessionTemplate == null) { 27 | sqlSessionTemplate = new SqlSessionTemplate(mySqlSessionFactoryBean.getSqlSessionFactoryBean().getObject()); 28 | } 29 | return sqlSessionTemplate; 30 | } catch (Exception e) { 31 | e.printStackTrace(); 32 | } 33 | return null; 34 | } 35 | 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/beans/CheckResult.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.beans; 2 | 3 | public class CheckResult { 4 | private String code; 5 | private String message; 6 | private String content; 7 | 8 | public CheckResult(String code, String message, String content) { 9 | this.code = code; 10 | this.message = message; 11 | this.content = content; 12 | } 13 | 14 | public String getCode() { 15 | return code; 16 | } 17 | 18 | public void setCode(String code) { 19 | this.code = code; 20 | } 21 | 22 | public String getMessage() { 23 | return message; 24 | } 25 | 26 | public void setMessage(String message) { 27 | this.message = message; 28 | } 29 | 30 | public String getContent() { 31 | return content; 32 | } 33 | 34 | public void setContent(String content) { 35 | this.content = content; 36 | } 37 | 38 | @Override 39 | public String toString() { 40 | return "CheckResult{" + 41 | "code='" + code + '\'' + 42 | ", message='" + message + '\'' + 43 | ", content='" + content + '\'' + 44 | '}'; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/beans/PlatformResponse.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.beans; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.alibaba.fastjson.annotation.JSONField; 5 | 6 | 7 | public class PlatformResponse { 8 | @JSONField(name = "match") 9 | private String match; 10 | @JSONField(name = "question") 11 | private String question; 12 | @JSONField(name = "score") 13 | private double score; 14 | @JSONField(name = "list") 15 | private JSONArray list; 16 | @JSONField(name = "answer") 17 | private String answer; 18 | @JSONField(name = "answer_type") 19 | private String answer_type; 20 | @JSONField(name = "ref_id") 21 | private String ref_id; 22 | @JSONField(name = "key") 23 | private String key; 24 | @JSONField(name = "scene_end") 25 | private String scene_end; 26 | @JSONField(name = "user_id") 27 | private String user_id; 28 | 29 | 30 | @JSONField(name = "media_url") 31 | private String media_url; 32 | 33 | public PlatformResponse() { 34 | } 35 | 36 | public PlatformResponse(String match, String question, double score, JSONArray list, String answer, String key, String answer_type, String ref_id, String scene_end, String user_id,String media_url) { 37 | this.match = match; 38 | this.question = question; 39 | this.score = score; 40 | this.list = list; 41 | this.answer = answer; 42 | this.key = key; 43 | this.answer_type = 
answer_type; 44 | this.ref_id = ref_id; 45 | this.scene_end = scene_end; 46 | this.user_id = user_id; 47 | this.media_url = media_url; 48 | } 49 | 50 | public String getKey() { 51 | return key; 52 | } 53 | 54 | public void setKey(String key) { 55 | this.key = key; 56 | } 57 | 58 | public String getMatch() { 59 | return match; 60 | } 61 | 62 | public void setMatch(String match) { 63 | this.match = match; 64 | } 65 | 66 | public String getQuestion() { 67 | return question; 68 | } 69 | 70 | public void setQuestion(String question) { 71 | this.question = question; 72 | } 73 | 74 | public double getScore() { 75 | return score; 76 | } 77 | 78 | public void setScore(double score) { 79 | this.score = score; 80 | } 81 | 82 | public JSONArray getList() { 83 | return list; 84 | } 85 | 86 | public void setList(JSONArray list) { 87 | this.list = list; 88 | } 89 | 90 | public String getAnswer() { 91 | return answer; 92 | } 93 | 94 | public void setAnswer(String answer) { 95 | this.answer = answer; 96 | } 97 | 98 | public String getScene_end() { 99 | return scene_end; 100 | } 101 | 102 | public void setScene_end(String scene_end) { 103 | this.scene_end = scene_end; 104 | } 105 | 106 | public String getUser_id() { 107 | return user_id; 108 | } 109 | 110 | public void setUser_id(String user_id) { 111 | this.user_id = user_id; 112 | } 113 | 114 | 115 | public String getMedia_url() { 116 | return media_url; 117 | } 118 | 119 | public void setMedia_url(String media_url) { 120 | this.media_url = media_url; 121 | } 122 | 123 | @Override 124 | public String toString() { 125 | return "PlatformResponse{" + 126 | "match='" + match + '\'' + 127 | ", question='" + question + '\'' + 128 | ", score=" + score + 129 | ", list='" + list + '\'' + 130 | ", answer='" + answer + '\'' + 131 | ", key='" + key + '\'' + 132 | ", ref_id='" + ref_id + '\'' + 133 | ", scene_end='" + scene_end + '\'' + 134 | '}'; 135 | } 136 | 137 | public String getAnswer_type() { 138 | return answer_type; 139 | } 140 | 141 | public void setAnswer_type(String answer_type) { 142 | this.answer_type = answer_type; 143 | } 144 | 145 | public String getRef_id() { 146 | return ref_id; 147 | } 148 | 149 | public void setRef_id(String ref_id) { 150 | this.ref_id = ref_id; 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/beans/QAAnalyzeResult.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.beans; 2 | 3 | public class QAAnalyzeResult { 4 | private double score; 5 | private String key; 6 | private String match; 7 | 8 | public QAAnalyzeResult() { 9 | } 10 | 11 | public QAAnalyzeResult(double score, String key, String match) { 12 | this.score = score; 13 | this.key = key; 14 | this.match = match; 15 | } 16 | 17 | public double getScore() { 18 | return score; 19 | } 20 | 21 | public void setScore(double score) { 22 | this.score = score; 23 | } 24 | 25 | public String getKey() { 26 | return key; 27 | } 28 | 29 | public void setKey(String key) { 30 | this.key = key; 31 | } 32 | 33 | public String getMatch() { 34 | return match; 35 | } 36 | 37 | public void setMatch(String match) { 38 | this.match = match; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/controller/CorpusController.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.controller; 2 | 3 | import com.watt.util.FileUtils; 4 
| import com.watt.util.NLPUtils; 5 | import org.springframework.web.bind.annotation.RequestMapping; 6 | import org.springframework.web.bind.annotation.RequestParam; 7 | import org.springframework.web.bind.annotation.RestController; 8 | 9 | import java.io.File; 10 | import java.io.IOException; 11 | import java.util.List; 12 | 13 | @RestController 14 | public class CorpusController { 15 | /** 16 | * 预处理数据 17 | */ 18 | @RequestMapping("/loadCorpus") 19 | public String loadCorpus(@RequestParam(name = "path") String path) { 20 | List files = FileUtils.listFiles(path); 21 | files.forEach(file -> { 22 | try { 23 | NLPUtils.textPreprocessing(file, file.getParent() + "/dump/" + file.getName() + ".seg.txt"); 24 | } catch (IOException e) { 25 | e.printStackTrace(); 26 | } 27 | }); 28 | return "success"; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/controller/QAController.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.controller; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.hankcs.hanlp.HanLP; 5 | import com.hankcs.hanlp.seg.Segment; 6 | import com.hankcs.hanlp.seg.common.Term; 7 | import com.hankcs.hanlp.tokenizer.NotionalTokenizer; 8 | import com.hankcs.lucene.HanLPAnalyzer; 9 | import com.watt.configure.LuceneConfig; 10 | import com.watt.core.QuestionsIndex; 11 | import com.watt.core.dictionary.CoreAbbreviationDictionary; 12 | import com.watt.core.dictionary.CoreStopWordsDictionary; 13 | import com.watt.core.dictionary.MyCustomDictionary; 14 | import com.watt.core.nlp.cosinesimlarity.SimilarityAnalyze; 15 | import com.watt.core.nlp.cosinesimlarity.SimilarityAnalyzeUnfamiliarWords; 16 | import com.watt.mvc.beans.CheckResult; 17 | import com.watt.mvc.beans.PlatformResponse; 18 | import com.watt.mvc.beans.QAAnalyzeResult; 19 | import com.watt.mvc.service.QAService; 20 | import com.watt.util.CommonUtils; 21 | import org.apache.lucene.document.Document; 22 | import org.apache.lucene.queryparser.classic.ParseException; 23 | import org.apache.lucene.queryparser.classic.QueryParser; 24 | import org.apache.lucene.search.IndexSearcher; 25 | import org.apache.lucene.search.Query; 26 | import org.apache.lucene.search.ScoreDoc; 27 | import org.apache.lucene.search.TopDocs; 28 | import org.slf4j.Logger; 29 | import org.slf4j.LoggerFactory; 30 | import org.springframework.beans.factory.annotation.Autowired; 31 | import org.springframework.web.bind.annotation.RequestMapping; 32 | import org.springframework.web.bind.annotation.RestController; 33 | 34 | import javax.servlet.http.HttpServletRequest; 35 | import java.io.File; 36 | import java.io.FileInputStream; 37 | import java.io.IOException; 38 | import java.io.ObjectInputStream; 39 | import java.util.HashMap; 40 | import java.util.List; 41 | import java.util.Map; 42 | 43 | /** 44 | * 对话实现类 45 | */ 46 | @RestController 47 | public class QAController { 48 | private final Logger logger = LoggerFactory.getLogger(this.getClass()); 49 | private LuceneConfig luceneConfig; 50 | private QuestionsIndex questionsIndex; 51 | private Segment segment; 52 | private SimilarityAnalyze similarAnalyze = new SimilarityAnalyzeUnfamiliarWords(); 53 | private QAService qaService; 54 | private MyCustomDictionary myCustomDictionary; 55 | private Map tfidf = null; 56 | @Autowired 57 | public QAController(LuceneConfig luceneConfig, QuestionsIndex questionsIndex, QAService qaService, MyCustomDictionary myCustomDictionary) { 58 | 
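// Startup sequence: the constructor wires the Spring beans, then eagerly loads
// everything the answering path needs (custom hot words, stop words,
// abbreviations, the word-vector model, synonym-calibrated vectors, the shared
// HanLP segmenter, and the TF-IDF model), so the first request pays no warm-up cost.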
this.luceneConfig = luceneConfig; 59 | this.questionsIndex = questionsIndex; 60 | this.qaService = qaService; 61 | this.myCustomDictionary = myCustomDictionary; 62 | //加载分词热词 63 | myCustomDictionary.initDictionary(); 64 | //加载停用词词典 65 | myCustomDictionary.initStopWords(); 66 | //加载全、简称词典将简称字段加入到分词热词中(全称不加入) 67 | myCustomDictionary.initAbbreviation(); 68 | //将所有的向量加载 69 | initWordVectors(); 70 | //1.词向量矫正 2.检索所有维护的词林同义词词典 3.并将所有的税务同义词加入到热词中 71 | initCilin(); 72 | //初始化分词服务 73 | initSeg(); 74 | //初始化tfidf模型 75 | initTfidf(); 76 | } 77 | 78 | /** 79 | * 初始化分词服务 80 | */ 81 | private void initSeg() { 82 | segment = HanLP.newSegment(); 83 | NotionalTokenizer.SEGMENT = segment; 84 | } 85 | 86 | /** 87 | * 初始化加载词向量 88 | */ 89 | private void initWordVectors() { 90 | similarAnalyze.loadGoogleModel(luceneConfig.getVectorPath()); 91 | logger.info("词向量加载完成"); 92 | } 93 | 94 | private void initTfidf(){ 95 | try { 96 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File(luceneConfig.getTfidfPath()))); 97 | tfidf = (HashMap)in.readObject(); 98 | }catch (Exception e){ 99 | logger.error("tfidf模型没有加载成功"); 100 | } 101 | } 102 | 103 | /** 104 | * 加载词林词典将词林同义词赋值到向量中 105 | */ 106 | private void initCilin() { 107 | //查询type类型为同义词的所有词汇,去掉like的词语 108 | //通过同义词、和专业词词典来重新校准向量值 109 | List> lines = qaService.querySynonymsAll("="); 110 | lines.forEach(line -> { 111 | String[] synonyms = line.get("synonym").trim().split("\\s+"); 112 | for (String synonym : synonyms) { 113 | float[] vector = similarAnalyze.getVec().getWordVector(synonym); 114 | if (vector != null) { 115 | for (String synonym1 : synonyms) { 116 | similarAnalyze.getVec().setWordVector(synonym1, vector); 117 | //如果存在自定义的同义词需要在此处加入到用户自定义分词词典中去 118 | } 119 | break; 120 | } 121 | } 122 | }); 123 | logger.info("同义词词林校准向量加载完成:" + lines.size()); 124 | } 125 | 126 | /** 127 | * 对话接口提供方法,分发多轮还是问答 128 | */ 129 | @RequestMapping("/getAnswer") 130 | public PlatformResponse query(HttpServletRequest request) throws IOException, ParseException { 131 | String question = request.getParameter("question").trim().replaceAll("\\s*", ""); 132 | JSONArray resultArray = this.searchAndCalculate(question); 133 | if (resultArray.isEmpty()) { 134 | return new PlatformResponse(); 135 | } 136 | //将分析的TOP最高的问题进行评估,返回一个评分最高的答案 137 | resultArray = CommonUtils.arrayCompare(resultArray); 138 | QAAnalyzeResult qaAnalyzeResult = resultArray.getObject(0, QAAnalyzeResult.class); 139 | Map answer = qaService.queryAnswer(qaAnalyzeResult.getKey()); 140 | String media_type = answer.get("MEDIA_TYPE"); 141 | String media_url = media_type.equals("IMG") || media_type.equals("GT") ? 
142 | qaService.queryMediaUrlByREF_ID(answer.get("REF_ID")) 143 | : null; 144 | //保存日志 145 | try { 146 | qaService.createLog(question, qaAnalyzeResult.getScore() + "", qaAnalyzeResult.getKey(), "manager", ""); 147 | } catch (Exception e) { 148 | e.printStackTrace(); 149 | } 150 | return new PlatformResponse(qaAnalyzeResult.getMatch(), 151 | question, qaAnalyzeResult.getScore(), resultArray, 152 | answer.get("TEXT_ANS"), qaAnalyzeResult.getKey(), answer.get("MEDIA_TYPE"), answer.get("REF_ID"), "1", "", media_url); 153 | } 154 | 155 | 156 | 157 | /** 158 | * getAnswer公共代码 159 | */ 160 | private JSONArray searchAndCalculate(String question) throws IOException, ParseException { 161 | //将目标问题进行分词,留着分析用 162 | List seg_question = CoreStopWordsDictionary.removeStopWords(CoreAbbreviationDictionary.convertAbbreviationToFull(segment.seg(question))); 163 | logger.info("全称转换后:" + seg_question); 164 | IndexSearcher searcher = luceneConfig.getIndexSearcher(); 165 | Query query = new QueryParser(luceneConfig.getIndexKey(), new HanLPAnalyzer()).parse(question); 166 | TopDocs result = searcher.search(query, 100); 167 | JSONArray resultArray = new JSONArray(); 168 | for (ScoreDoc doc : result.scoreDocs) { 169 | Document document = searcher.doc(doc.doc); 170 | String question2 = document.get("questions"); 171 | List seg_question2 = segment.seg(question2); 172 | double score = similarAnalyze.sentenceSimilarity(seg_question, seg_question2); 173 | resultArray.add(new QAAnalyzeResult(score, document.get("key"), question2)); 174 | } 175 | return resultArray; 176 | } 177 | 178 | /** 179 | * 初始化时创建索引 180 | */ 181 | @RequestMapping("/createIndex") 182 | public CheckResult createIndex() { 183 | questionsIndex.createIndex(); 184 | return new CheckResult("000", "success", ""); 185 | } 186 | 187 | /** 188 | * 重新加载全简称词典 189 | */ 190 | @RequestMapping("/reloadAbbreviation") 191 | public CheckResult reloadAbbreviation() { 192 | myCustomDictionary.initAbbreviation(); 193 | return new CheckResult("000", "success", ""); 194 | } 195 | 196 | /** 197 | * 相似性分析核心计算类 198 | */ 199 | public SimilarityAnalyze getSimilarAnalyze() { 200 | return similarAnalyze; 201 | } 202 | 203 | /** 204 | * 获取统一的分词对象 205 | */ 206 | public Segment getSegment() { 207 | return segment; 208 | } 209 | 210 | } 211 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/dao/QADao.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.dao; 2 | 3 | import com.watt.data.jdbc.MySqlSessionTemplate; 4 | import org.mybatis.spring.SqlSessionTemplate; 5 | import org.springframework.beans.factory.annotation.Autowired; 6 | import org.springframework.stereotype.Repository; 7 | 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | @Repository 12 | public class QADao { 13 | private SqlSessionTemplate sqlSessionTemplate; 14 | 15 | @Autowired 16 | public QADao(MySqlSessionTemplate sqlSessionTemplate) { 17 | this.sqlSessionTemplate = sqlSessionTemplate.getSqlSessionTemplate(); 18 | } 19 | 20 | /** 21 | * 查询所有的问题列表,根据start 和 end作为大数据下的分页 22 | * 23 | */ 24 | public List> queryQuestions(Map param) { 25 | return sqlSessionTemplate.selectList("QADao.queryQuestions", param); 26 | } 27 | 28 | /** 29 | * 查找答案 30 | * @param map kwid 31 | * @return 一个答案 32 | */ 33 | public List> queryAnswer(Map map) { 34 | return sqlSessionTemplate.selectList("QADao.queryAnswer", map); 35 | } 36 | /** 37 | * 查询所有的词典列表 38 | */ 39 | public List> queryDictionaryAll() { 40 | return 
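// Statement ids such as "QADao.queryDictionaryAll" are resolved against the
// mapped statements declared in QADao.xml through the shared SqlSessionTemplate.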
sqlSessionTemplate.selectList("QADao.queryDictionaryAll", null); 41 | } 42 | 43 | /** 44 | * 根据条件查询同义词词典 45 | */ 46 | public List> querySynonymsAll(Map param) { 47 | return sqlSessionTemplate.selectList("QADao.querySynonymsAll", param); 48 | } 49 | /** 50 | * 添加税务专用名词词典 51 | * 52 | * @param word 词条或语料 53 | * @return int 54 | */ 55 | public int addTaxDictionaryWord(String word) { 56 | return sqlSessionTemplate.insert("QADao.addTaxDictionaryWord", word); 57 | } 58 | 59 | /** 60 | * 向日志表中插入一条数据,写一条日志 61 | */ 62 | public int createLog(Map param) { 63 | return sqlSessionTemplate.insert("QADao.createLog", param); 64 | } 65 | /** 66 | * 查询同义词列表 67 | */ 68 | public List querySynonyms(){ 69 | return sqlSessionTemplate.selectList("QADao.querySynonyms"); 70 | } 71 | 72 | /** 73 | * 查询全、简称词典 74 | */ 75 | public List> queryAbbreviation(Map param) { 76 | return sqlSessionTemplate.selectList("QADao.queryAbbreviation", param); 77 | } 78 | 79 | /** 80 | * 查询所有停用词 81 | */ 82 | public List queryStopWordsAll() { 83 | return sqlSessionTemplate.selectList("QADao.queryStopWordsAll"); 84 | } 85 | 86 | public Map queryMediaUrlByREF_ID(Map param) { 87 | return sqlSessionTemplate.selectOne("QADao.queryMediaUrlByREF_ID",param); 88 | } 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/dao/QADao.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 6 | 15 | 22 | 26 | 27 | INSERT INTO knowlede_dictionary_custom (WORD) 28 | VALUES (#{word , jdbcType=VARCHAR}) 29 | 30 | 34 | 35 | 42 | 43 | INSERT INTO knowledge_dictionary_synonym (id, synonym) 44 | VALUES (#{id , jdbcType=VARCHAR}, #{synonym , jdbcType=VARCHAR}) 45 | 46 | 47 | 57 | 58 | INSERT INTO knowledge_qa_logs (id, question, score, question_id, channel_id) 59 | VALUES (#{id , jdbcType=VARCHAR}, #{question , jdbcType=VARCHAR}, #{score , jdbcType=VARCHAR}, 60 | #{kw_id , jdbcType=VARCHAR}, #{channel_id , jdbcType=VARCHAR}) 61 | 62 | 63 | 67 | 72 | -------------------------------------------------------------------------------- /src/main/java/com/watt/mvc/service/QAService.java: -------------------------------------------------------------------------------- 1 | package com.watt.mvc.service; 2 | 3 | import com.watt.mvc.dao.QADao; 4 | import org.springframework.beans.factory.annotation.Autowired; 5 | import org.springframework.stereotype.Service; 6 | 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | @Service 12 | public class QAService { 13 | private QADao qaDao; 14 | 15 | @Autowired 16 | public QAService(QADao qaDao) { 17 | this.qaDao = qaDao; 18 | } 19 | 20 | /** 21 | * 查询所有的问题列表,根据start 和 end 作为大数据下的分页 22 | */ 23 | public List> queryQuestions(int start, int end) { 24 | Map param = new HashMap(); 25 | param.put("start", start); 26 | param.put("end", end); 27 | return qaDao.queryQuestions(param); 28 | } 29 | 30 | /** 31 | * 查询所有的词典列表 32 | */ 33 | public List> queryDictionaryAll() { 34 | return qaDao.queryDictionaryAll(); 35 | } 36 | 37 | /** 38 | * 添加税务专用名词词典 39 | * 40 | * @param word 词条或语料 41 | * @return int 42 | */ 43 | public int addTaxDictionaryWord(String word) { 44 | return qaDao.addTaxDictionaryWord(word); 45 | } 46 | 47 | /** 48 | * 查询同义词列表 49 | */ 50 | public List querySynonyms() { 51 | return qaDao.querySynonyms(); 52 | } 53 | 54 | /** 55 | * 根据条件查询同义词词典 56 | */ 57 | public List> querySynonymsAll(String type) { 58 | Map param = new HashMap<>(); 59 | param.put("type", type); 60 | return 
qaDao.querySynonymsAll(param); 61 | } 62 | 63 | /** 64 | * 根据问题的id查询答案 65 | */ 66 | public Map queryAnswer(String key) { 67 | Map map = new HashMap<>(); 68 | map.put("key", key); 69 | List> result = qaDao.queryAnswer(map); 70 | if (result == null || result.isEmpty()) { 71 | return null; 72 | } else { 73 | return result.get(0); 74 | } 75 | } 76 | 77 | /** 78 | * 查询全、简称词典 79 | */ 80 | public List> queryAbbreviation(Map param) { 81 | return qaDao.queryAbbreviation(param); 82 | } 83 | 84 | /** 85 | * 向日志表中插入一条数据,写一条日志 86 | */ 87 | public int createLog(String question, String score, String kw_id, String channel_id, String user_id) { 88 | Map param = new HashMap<>(); 89 | param.put("question", question); 90 | param.put("score", score); 91 | param.put("kw_id", kw_id); 92 | param.put("channel_id", channel_id); 93 | param.put("user_id", user_id); 94 | return qaDao.createLog(param); 95 | } 96 | 97 | public List queryStopWordsAll() { 98 | return qaDao.queryStopWordsAll(); 99 | } 100 | 101 | public String queryMediaUrlByREF_ID(String key) { 102 | Map param = new HashMap<>(); 103 | param.put("REF_ID", key); 104 | Map map = qaDao.queryMediaUrlByREF_ID(param); 105 | return map.get("MEDIA_URL").toString(); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/java/com/watt/util/CommonUtils.java: -------------------------------------------------------------------------------- 1 | package com.watt.util; 2 | 3 | import com.alibaba.fastjson.JSONArray; 4 | import com.alibaba.fastjson.JSONObject; 5 | 6 | public class CommonUtils { 7 | /** 8 | * 给target按照score的大小重新从大到小排序 9 | * @return 排序后的array 10 | */ 11 | public static JSONArray arrayCompare(JSONArray target) { 12 | JSONArray result = new JSONArray(); 13 | for (int i = 0; target.size() > 0; i++) { 14 | JSONObject iobj = findMaxScore(target); 15 | removeOne(target, iobj); 16 | result.add(iobj); 17 | } 18 | return result; 19 | } 20 | 21 | /** 22 | * 获得JSON列表的top N 23 | * 24 | * @param target 目表array 25 | * @param n 多少个 26 | */ 27 | public static JSONArray getTop(JSONArray target, int n) { 28 | JSONArray result = new JSONArray(); 29 | for (int i = 0; i < target.size() && i < n; i++) { 30 | result.add(target.get(i)); 31 | } 32 | return result; 33 | } 34 | 35 | /** 36 | * 遍历整个list找出一个最大的Object 37 | */ 38 | private static JSONObject findMaxScore(JSONArray target) { 39 | JSONObject firstObject = target.getJSONObject(0); 40 | for (int i = 1; i < target.size(); i++) { 41 | if (target.getJSONObject(i).getDouble("score") > firstObject.getDouble("score")) { 42 | firstObject = target.getJSONObject(i); 43 | } 44 | } 45 | return firstObject; 46 | } 47 | 48 | /** 49 | * 移除某个key为制定的object 50 | */ 51 | private static void removeOne(JSONArray target, JSONObject one) { 52 | for (int i = 0; i < target.size(); i++) { 53 | if (target.getJSONObject(i).get("questionID").equals(one.get("questionID"))) { 54 | target.remove(i); 55 | return; 56 | } 57 | } 58 | } 59 | 60 | // public static void main(String[] args) { 61 | // JSONArray array = new JSONArray(); 62 | // JSONObject object1 = new JSONObject(); 63 | // JSONObject object2 = new JSONObject(); 64 | // JSONObject object3 = new JSONObject(); 65 | // JSONObject object4 = new JSONObject(); 66 | // 67 | // object1.put("score",2.44); 68 | // object1.put("key",2.44); 69 | // object2.put("score",4.454); 70 | // object2.put("key",4.454); 71 | // object3.put("score",3.24); 72 | // object3.put("key",3.24); 73 | // object4.put("score",16.00); 74 | // object4.put("key",16.00); 75 | // 
array.add(object1); 76 | // array.add(object2); 77 | // array.add(object3); 78 | // array.add(object4); 79 | // System.out.println(arrayCompare(array).toJSONString()); 80 | // } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/watt/util/FileUtils.java: -------------------------------------------------------------------------------- 1 | package com.watt.util; 2 | 3 | import java.io.*; 4 | import java.nio.charset.Charset; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Objects; 8 | 9 | public class FileUtils { 10 | /** 11 | * 清除某个目录下所有的文件 12 | * 13 | * @param path 目标path 14 | */ 15 | public static void clearPath(String path) { 16 | File file = new File(path); 17 | if (file.isDirectory()) { 18 | File[] childFiles = file.listFiles(); 19 | if (childFiles == null || file.length() == 0) { 20 | return; 21 | } 22 | for (File childFile : childFiles) { 23 | childFile.delete(); 24 | } 25 | } else { 26 | file.delete(); 27 | } 28 | } 29 | 30 | /** 31 | * 将文件目录下所有文件全部罗列出来进行 32 | * 33 | * @param path 父文件路径 34 | * @return 所有的文件列表 35 | */ 36 | public static List listFiles(String path) { 37 | List result = new ArrayList<>(); 38 | File file = new File(path); 39 | for (File one : Objects.requireNonNull(file.listFiles())) { 40 | if (one.isDirectory()) { 41 | result.addAll(listFiles(one.getPath())); 42 | } else { 43 | result.add(one); 44 | } 45 | } 46 | return result; 47 | } 48 | 49 | /** 50 | * 给定文件返回读取方法 51 | * 52 | * @param file 目标文件 53 | * @return 大文件读取的文件流 54 | */ 55 | public static BufferedReader getFileReader(File file) throws FileNotFoundException { 56 | return new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)), Charset.forName("GBK"))); 57 | } 58 | 59 | /** 60 | * 给定文件返回读取方法 61 | * 62 | * @param file 目标文件 63 | * @return 大文件读取的文件流 64 | */ 65 | public static BufferedReader getFileReader(File file, Charset charset) throws FileNotFoundException { 66 | return new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)), charset)); 67 | } 68 | /** 69 | * 给定文件返回读取方法 70 | * 71 | * @param file 目标文件 72 | * @return 大文件读取的文件流 73 | */ 74 | public static BufferedReader getFileReader(String file) throws FileNotFoundException { 75 | return new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)))); 76 | } 77 | 78 | public static String readLine(BufferedReader reader) throws IOException { 79 | return reader.readLine(); 80 | } 81 | 82 | /** 83 | * @throws FileNotFoundException 文件找不到 84 | */ 85 | public static BufferedWriter getBufferedWriter(String fileName) throws IOException { 86 | File createFile = new File(fileName); 87 | if (!createFile.exists()) { 88 | createFile.createNewFile(); 89 | } 90 | return new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(createFile)))); 91 | } 92 | public static void main(String[] args){ 93 | List files = listFiles("/root/data/corpus/"); 94 | files.forEach(file -> { 95 | System.out.println(file.getParent()+","+file.getPath()); 96 | }); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/watt/util/NLPUtils.java: -------------------------------------------------------------------------------- 1 | package com.watt.util; 2 | 3 | import com.hankcs.hanlp.HanLP; 4 | import com.hankcs.hanlp.seg.common.Term; 5 | import com.hankcs.hanlp.tokenizer.NotionalTokenizer; 6 | 7 | import 
java.io.BufferedReader; 8 | import java.io.BufferedWriter; 9 | import java.io.File; 10 | import java.io.IOException; 11 | import java.util.List; 12 | 13 | public class NLPUtils { 14 | /** 15 | * 训练语料文件预处理 16 | * 1.分词 17 | * 2.繁转简 18 | * 3.去停用词 19 | * 20 | * @param file 目标文件 21 | * @param target 存储到的目标文件 22 | */ 23 | public static void textPreprocessing(File file, String target) throws IOException { 24 | BufferedReader reader = FileUtils.getFileReader(file); 25 | BufferedWriter writer = FileUtils.getBufferedWriter(target); 26 | String wordLine = null; 27 | while ((wordLine = reader.readLine()) != null) { 28 | wordLine = HanLP.tw2s(wordLine); 29 | List termList = NotionalTokenizer.segment(wordLine); 30 | String line = convertTermtoString(termList); 31 | writer.newLine(); 32 | writer.write(line); 33 | } 34 | writer.flush(); 35 | reader.close(); 36 | writer.close(); 37 | } 38 | 39 | /** 40 | * 将分词数据拼接成字符串 41 | */ 42 | public static String convertTermtoString(List termList, String segChar) { 43 | StringBuffer buffer = new StringBuffer(); 44 | termList.forEach(term -> { 45 | buffer.append(term.word).append(segChar); 46 | }); 47 | return buffer.toString().trim(); 48 | } 49 | 50 | /** 51 | * 将分词数据拼接成字符串 52 | */ 53 | public static String convertTermtoString(List termList) { 54 | return convertTermtoString(termList," "); 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src/main/resources/application.yml: -------------------------------------------------------------------------------- 1 | db: 2 | mysql: 3 | driverClass: com.mysql.cj.jdbc.Driver #这个是mysql驱动配置不需要改动 4 | jdbcUrl: jdbc:mysql://10.111.29.21:3306/tax_knowledge?useUnicode=true&characterEncoding=gb2312 #mysql地址端口号配置 5 | user: root #mysql用户名和密码配置 6 | password: Abcd1234! 
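# Note: the host, credentials and characterEncoding above are sample values; they must
# match your own MySQL instance and the gb2312/utf8 charsets used by the tables in init.sql.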
#mysql用户名和密码配置 7 | mybatis: 8 | mybatisXml: /mybatis.xml #mybatis文件配置路径 9 | lucene: 10 | root: /root/lucene/ #Lucene索引位置的根目录 11 | indexKey: questionWithSynonyms #这个是Lucene查询、建立索引的时候共享的一个key,这个key可以一直不改变 12 | vectorPath: /root/data/wiki_chinese_word2vec.bin #词向量物理路径 13 | tfidfPath: /root/data/tfidf #tfidf模型路径 -------------------------------------------------------------------------------- /src/main/resources/hanlp.properties: -------------------------------------------------------------------------------- 1 | #本配置文件中的路径的根目录,根目录+其他路径=完整路径(支持相对路径,请参考:https://github.com/hankcs/HanLP/pull/254) 2 | #Windows用户请注意,路径分隔符统一使用/ 3 | root=/root/ 4 | #核心词典路径 5 | CoreDictionaryPath=data/dictionary/CoreNatureDictionary.txt 6 | #2元语法词典路径 7 | BiGramDictionaryPath=data/dictionary/CoreNatureDictionary.ngram.txt 8 | #停用词词典路径 9 | CoreStopWordDictionaryPath=data/dictionary/stopwords.txt 10 | #同义词词典路径 11 | CoreSynonymDictionaryDictionaryPath=data/dictionary/synonym/CoreSynonym.txt 12 | #人名词典路径 13 | PersonDictionaryPath=data/dictionary/person/nr.txt 14 | #人名词典转移矩阵路径 15 | PersonDictionaryTrPath=data/dictionary/person/nr.tr.txt 16 | #繁简词典根目录 17 | tcDictionaryRoot=data/dictionary/tc 18 | #自定义词典路径,用;隔开多个自定义词典,空格开头表示在同一个目录,使用“文件名 词性”形式则表示这个词典的词性默认是该词性。优先级递减。 19 | #另外data/dictionary/custom/CustomDictionary.txt是个高质量的词库,请不要删除。所有词典统一使用UTF-8编码。 20 | CustomDictionaryPath=data/dictionary/custom/CustomDictionary.txt; 现代汉语补充词库.txt; 全国地名大全.txt ns; 人名词典.txt; 机构名词典.txt; 上海地名.txt ns;data/dictionary/person/nrf.txt nrf; 21 | #CRF分词模型路径 22 | CRFSegmentModelPath=data/model/segment/CRFSegmentModel.txt 23 | #HMM分词模型 24 | HMMSegmentModelPath=data/model/segment/HMMSegmentModel.bin 25 | #分词结果是否展示词性 26 | ShowTermNature=true 27 | #IO适配器,实现com.hankcs.hanlp.corpus.io.IIOAdapter接口以在不同的平台(Hadoop、Redis等)上运行HanLP 28 | #默认的IO适配器如下,该适配器是基于普通文件系统的。 29 | #IOAdapter=com.hankcs.hanlp.corpus.io.FileIOAdapter -------------------------------------------------------------------------------- /src/main/resources/mybatis.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/lib/hanlp-1.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/watt1010/knowledge/9141c30f039bc924a0ef8c0e83e5233d1042ce04/src/main/webapp/WEB-INF/lib/hanlp-1.7.2.jar -------------------------------------------------------------------------------- /src/main/webapp/WEB-INF/web.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | Archetype Created Web Application 7 | 8 | -------------------------------------------------------------------------------- /src/main/webapp/index.jsp: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Hello World!

4 | 5 | 6 | 
-------------------------------------------------------------------------------- /src/test/java/MapCount.java: --------------------------------------------------------------------------------
1 | import java.io.Serializable;
2 | import java.util.HashMap;
3 | 
4 | /**
5 |  * Per-key counter triple: value[0] = total term frequency, value[1] = document
6 |  * frequency, value[2] = index of the document counted last (a scratch field
7 |  * used to avoid double-counting within one document).
8 |  */
9 | public class MapCount<T> implements Serializable {
10 | private HashMap<T, int[]> data;
11 | 
12 | MapCount() {
13 | this.data = new HashMap<>();
14 | }
15 | 
16 | public MapCount(int initialCapacity) {
17 | this.data = new HashMap<>(initialCapacity);
18 | }
19 | 
20 | private void add(T key, int index, int n) {
21 | int[] value;
22 | if ((value = this.data.get(key)) != null) {
23 | value[index] = value[index] + n;
24 | this.data.put(key, value);
25 | } else {
26 | value = new int[3];
27 | value[index] = value[index] + n;
28 | this.data.put(key, value);
29 | }
30 | }
31 | 
32 | void add(T key, int index) {
33 | this.add(key, index, 1);
34 | }
35 | 
36 | public int size() {
37 | return this.data.size();
38 | }
39 | 
40 | public void remove(T t) {
41 | this.data.remove(t);
42 | }
43 | 
44 | public HashMap<T, int[]> get() {
45 | return this.data;
46 | }
47 | }
-------------------------------------------------------------------------------- /src/test/java/wikiCorpus.java: --------------------------------------------------------------------------------
1 | import com.hankcs.hanlp.HanLP;
2 | import com.hankcs.hanlp.corpus.document.sentence.Sentence;
3 | import com.hankcs.hanlp.corpus.document.sentence.word.IWord;
4 | import com.hankcs.hanlp.seg.common.Term;
5 | import com.hankcs.hanlp.tokenizer.NLPTokenizer;
6 | import com.watt.util.FileUtils;
7 | import org.jsoup.Jsoup;
8 | import org.jsoup.nodes.Document;
9 | import org.junit.Test;
10 | 
11 | import java.io.*;
12 | import java.nio.charset.Charset;
13 | import java.util.*;
14 | 
15 | public class wikiCorpus {
16 | 
17 | /**
18 |  * Total number of documents in the corpus: 1020652
19 |  *
20 |  * @throws Exception error stop
21 |  */
22 | @Test
23 | public void generateKeySet() throws Exception {
24 | BufferedReader reader = FileUtils.getFileReader(new File("D:\\corpus\\data\\zh_wiki_00"), Charset.forName("UTF-8"));
25 | MapCount<String> mapCount = new MapCount<>();
26 | StringBuilder wordLine = new StringBuilder();
27 | String temp = null;
28 | int count = 1;
29 | while ((temp = reader.readLine()) != null) {
30 | wordLine.append(temp);
31 | // the "<doc ...>...</doc>" markers below were stripped from this dump and have been restored
32 | if (wordLine.indexOf("<doc") > -1 && wordLine.indexOf("</doc>") > -1) {
33 | int start = wordLine.indexOf("<doc");
34 | int end = wordLine.indexOf("</doc>") + 6;
35 | String s = wordLine.substring(start, end);
36 | wordLine.delete(start, end);
37 | 
38 | Document doc = Jsoup.parse(s);
39 | Sentence sentence = NLPTokenizer.ANALYZER.analyze(HanLP.tw2s(doc.select("doc").text()));
40 | for (IWord iWord : Objects.requireNonNull(sentence).wordList) {
41 | int[] value = Optional.ofNullable(mapCount.get().get(iWord.getValue())).orElse(new int[3]);
42 | if (value[2] != count) {
43 | value[2] = count; // remember the current document index
44 | value[1] = value[1] + 1; // document frequency +1
45 | value[0] = value[0] + 1; // total term frequency +1
46 | } else {
47 | value[0] = value[0] + 1;
48 | }
49 | mapCount.get().put(iWord.getValue(), value);
50 | }
51 | System.out.println("count:" + count++);
52 | // if (count % 500 == 0) {
53 | // ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keyset\\" + UUID.randomUUID().toString())));
54 | // out.writeObject(mapCount);
55 | // out.flush();
56 | // out.close();
57 | // }
58 | }
59 | }
60 | ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keySetNLP")));
61 | out.writeObject(mapCount);
62 | out.flush();
63 | out.close();
64 | System.out.println("count:" + count);
65 | reader.close();
66 | }
67 | 
68 | @Test
69 | public void generateKeySet2() throws Exception {
70 | BufferedReader reader = FileUtils.getFileReader(new File("D:\\corpus\\data\\zh_wiki_00"), Charset.forName("UTF-8"));
71 | MapCount<String> mapCount = new MapCount<>();
72 | StringBuilder wordLine = new StringBuilder();
73 | String temp = null;
74 | int count = 1;
75 | while ((temp = reader.readLine()) != null) {
76 | wordLine.append(temp);
77 | if (wordLine.indexOf("<doc") > -1 && wordLine.indexOf("</doc>") > -1) {
78 | int start = wordLine.indexOf("<doc");
79 | int end = wordLine.indexOf("</doc>") + 6;
80 | String s = wordLine.substring(start, end);
81 | wordLine.delete(start, end);
82 | 
83 | Document doc = Jsoup.parse(s);
84 | List<Term> sentence = HanLP.segment(HanLP.tw2s(doc.select("doc").text()));
85 | for (Term term : Objects.requireNonNull(sentence)) {
86 | int[] value = Optional.ofNullable(mapCount.get().get(term.word)).orElse(new int[3]);
87 | if (value[2] != count) {
88 | value[2] = count; // remember the current document index
89 | value[1] = value[1] + 1; // document frequency +1
90 | value[0] = value[0] + 1; // total term frequency +1
91 | } else {
92 | value[0] = value[0] + 1;
93 | }
94 | mapCount.get().put(term.word, value);
95 | }
96 | System.out.println("count:" + count++);
97 | // if (count % 500 == 0) {
98 | // ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keyset\\" + UUID.randomUUID().toString())));
99 | // out.writeObject(mapCount);
100 | // out.flush();
101 | // out.close();
102 | // }
103 | }
104 | }
105 | ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\keySet1")));
106 | out.writeObject(mapCount);
107 | out.flush();
108 | out.close();
109 | System.out.println("count:" + count);
110 | reader.close();
111 | }
112 | 
113 | @Test
114 | public void caltfidf() throws Exception {
115 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File("D:\\corpus\\data\\keySet1")));
116 | MapCount<String> mapCount = (MapCount<String>) in.readObject();
117 | Map<String, Double> tfidf = new HashMap<>();
118 | for (Map.Entry<String, int[]> one : mapCount.get().entrySet()) {
119 | // value[1] is the document frequency; the dumped source divided by value[2],
120 | // the scratch document index, which would make the IDF term meaningless
121 | tfidf.put(one.getKey(), one.getValue()[0] * Math.log(1020652.0 / one.getValue()[1]));
122 | }
123 | ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(new File("D:\\corpus\\data\\tfidf")));
124 | out.writeObject(tfidf);
125 | out.flush();
126 | out.close();
127 | }
128 | 
129 | @Test
130 | public void testcaltfidf() throws Exception {
131 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File("/home/watt/corpus/data/keySet1")));
132 | MapCount<String> mapCount = (MapCount<String>) in.readObject();
133 | Map<String, Double> tfidf = new HashMap<>();
134 | for (Map.Entry<String, int[]> one : mapCount.get().entrySet()) {
135 | tfidf.put(one.getKey(), one.getValue()[0] * Math.log(1020652.0 / one.getValue()[1]));
136 | }
137 | }
138 | 
139 | @Test
140 | public void readTfidf() throws Exception {
141 | ObjectInputStream in = new ObjectInputStream(new FileInputStream(new File("/home/watt/corpus/data/tfidf")));
142 | Map<String, Double> tfidf = (HashMap<String, Double>) in.readObject();
143 | System.out.println(tfidf.get("的"));
144 | System.out.println(tfidf.get("是"));
145 | System.out.println(tfidf.get("为什么"));
146 | }
147 | 
148 | }
--------------------------------------------------------------------------------