├── README.md
├── pom.xml
└── src
├── main
└── java
│ └── cm
│ └── eg
│ ├── model
│ ├── Article.java
│ ├── CompareReport.java
│ ├── CompareTask.java
│ └── WordFreq.java
│ └── util
│ └── TextUtil.java
└── test
├── java
└── cm
│ └── eg
│ ├── DocTest.java
│ ├── ProcessTest.java
│ └── SegmentTest.java
└── resource
├── doc
├── Li_info1.doc
├── Li_info2.doc
├── Li_info3.doc
├── Li_info4.doc
├── Wu_info1.doc
├── Wu_info2.doc
├── Wu_info3.doc
└── Wu_info4.doc
└── txt
└── 小王子.txt
/README.md:
--------------------------------------------------------------------------------
1 | # HanLP-TextSimilarity
2 |
3 | 中文分词、统计词频、比对文本相似度
4 |
5 | 使用了汉语言处理包HanLP (https://github.com/hankcs/HanLP)
6 |
7 | 2017.4.18
8 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 4.0.0
5 |
6 | cm.eg
7 | text-similarity
8 | 0.0.2
9 |
10 |
11 |
12 |
13 | com.hankcs
14 | hanlp
15 | portable-1.7.3
16 |
17 |
18 |
19 | com.google.guava
20 | guava
21 | 27.1-jre
22 |
23 |
24 |
25 | org.apache.tika
26 | tika-core
27 | 1.20
28 |
29 |
30 |
31 | org.apache.tika
32 | tika-parsers
33 | 1.20
34 |
35 |
36 | org.apache.poi
37 | poi-ooxml-schemas
38 |
39 |
40 |
41 |
42 |
43 | org.apache.poi
44 | ooxml-schemas
45 | 1.3
46 |
47 |
48 |
49 | org.apache.poi
50 | ooxml-security
51 | 1.1
52 |
53 |
54 |
55 | junit
56 | junit
57 | 4.12
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/src/main/java/cm/eg/model/Article.java:
--------------------------------------------------------------------------------
1 | package cm.eg.model;
2 |
3 | import java.util.List;
4 |
5 | import com.hankcs.hanlp.seg.common.Term;
6 |
7 | public class Article {
8 | private String name;
9 |
10 | private String text;
11 |
12 | private List segmentList;
13 | private List wordFreqList;
14 |
15 | public List getSegmentList() {
16 | return segmentList;
17 | }
18 |
19 | public void setSegmentList(List segmentList) {
20 | this.segmentList = segmentList;
21 | }
22 |
23 | public String getName() {
24 | return name;
25 | }
26 |
27 | public void setName(String name) {
28 | this.name = name;
29 | }
30 |
31 | public String getText() {
32 | return text;
33 | }
34 |
35 | public List getWordFreqList() {
36 | return wordFreqList;
37 | }
38 |
39 | public void setText(String text) {
40 | this.text = text;
41 | }
42 |
43 | public void setWordFreqList(List wordFreqList) {
44 | this.wordFreqList = wordFreqList;
45 | }
46 |
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/java/cm/eg/model/CompareReport.java:
--------------------------------------------------------------------------------
1 | package cm.eg.model;
2 |
3 | /**
4 | * 文本相似度的对比报告。
5 | */
6 | public class CompareReport {
7 | private Article a1;
8 | private Article a2;
9 |
10 | private int top;
11 |
12 | private double similarity;
13 |
14 | public CompareReport(Article a1, Article a2, int top, double similarity) {
15 | this.a1 = a1;
16 | this.a2 = a2;
17 | this.top = top;
18 | this.similarity = similarity;
19 | }
20 |
21 | @Override
22 |
23 | /**
24 | * 返回比较结果的简单报告。
25 | * 可调用getDetail()方法获得详细报告。
26 | */
27 | public String toString() {
28 | return String.format("%s和%s 相似度:%.0f%%", a1.getName(), a2.getName(), similarity*100);
29 | }
30 |
31 | public String getConsoleDetail() {
32 | StringBuilder sb = new StringBuilder();
33 | String ls = System.lineSeparator();
34 | sb.append(a1.getName() +" 长度:"+a1.getText().length() +" 高频词:");
35 | sb.append(ls);
36 | sb.append(a1.getWordFreqList().subList(0, top));
37 | sb.append(ls);
38 | sb.append(a2.getName() +" 长度:"+a2.getText().length() + " 高频词:");
39 | sb.append(ls);
40 | sb.append(a2.getWordFreqList().subList(0, top));
41 | sb.append(ls);
42 | sb.append("相似度:" + String.format("%.4f",similarity));
43 | return sb.toString();
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/main/java/cm/eg/model/CompareTask.java:
--------------------------------------------------------------------------------
1 | package cm.eg.model;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Comparator;
5 | import java.util.List;
6 |
7 | /**
8 | * 用于表示比较两篇文章的任务。
9 | * 设置了任务状态;可查看比较的结果。
10 | */
11 | public class CompareTask {
12 |
13 | private Article a1;
14 | private Article a2;
15 |
16 | private boolean isFinished;
17 |
18 | private CompareReport compareReport;
19 |
20 | // 取高频词前若干位
21 | private static final int TOP_MAX = 20;
22 |
23 | public CompareTask(Article a1, Article a2) {
24 | this.a1 = a1;
25 | this.a2 = a2;
26 | this.isFinished = false;
27 | }
28 |
29 | public void execute() {
30 | // 取高频词前若干位
31 | int top = TOP_MAX;
32 | List freq1 = a1.getWordFreqList().subList(0, top);
33 | List freq2 = a2.getWordFreqList().subList(0, top);
34 | // 获取高频词并集
35 | ArrayList union = new ArrayList<>(freq1);
36 | union.addAll(freq2);
37 | // 生成向量
38 | ArrayList v1 = new ArrayList<>(freq1);
39 | ArrayList v2 = new ArrayList<>(freq2);
40 | for (WordFreq wf : union) {
41 | // 向量包含了高频词并集里的每个词,如果不包含则词频为0
42 | if (!v1.contains(wf)) {
43 | v1.add(new WordFreq(wf.getWord(), 0));
44 | }
45 | if (!v2.contains(wf)) {
46 | v2.add(new WordFreq(wf.getWord(), 0));
47 | }
48 | }
49 | // 根据词语排序以对齐向量,方便计算
50 | Comparator strComp = Comparator.comparing(WordFreq::getWord);
51 | v1.sort(strComp);
52 | v2.sort(strComp);
53 | /**
54 | * 代入公式
55 | */
56 | double vProduct = 0;
57 | int sumSquare1 = 0;
58 | int sumSquare2 = 0;
59 | for (int i = 0; i < v1.size(); ++i) {
60 | int num1 = v1.get(i).getFreq();
61 | int num2 = v2.get(i).getFreq();
62 | // 向量点积
63 | vProduct += num1 * num2;
64 | // 求向量模的过程
65 | sumSquare1 += num1 * num1;
66 | sumSquare2 += num2 * num2;
67 | }
68 | // 两向量模的乘积
69 | double normProduct = Math.sqrt(sumSquare1 * sumSquare2);
70 | // 点积除以模乘积
71 | double similarity = vProduct / normProduct;
72 | // 标记完成状态
73 | isFinished = true;
74 | makeReport(a1, a2, similarity);
75 | }
76 |
77 | private void makeReport(Article a1, Article a2, double similarity) {
78 | compareReport = new CompareReport(a1, a2, TOP_MAX, similarity);
79 |
80 | }
81 |
82 | public boolean isFinished() {
83 | return isFinished;
84 | }
85 |
86 | public CompareReport getCompareReport() {
87 | return compareReport;
88 | }
89 |
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/java/cm/eg/model/WordFreq.java:
--------------------------------------------------------------------------------
1 | package cm.eg.model;
2 |
3 | /**
4 | * 词频。
5 | */
/**
 * A word together with its occurrence count.
 * Equality and hashing are based on the word only, so collections of
 * WordFreq can be matched by word regardless of frequency.
 */
public class WordFreq {

    private String word;
    private int freq;

    public WordFreq(String word, int freq) {
        this.word = word;
        this.freq = freq;
    }

    public String getWord() {
        return word;
    }

    public int getFreq() {
        return freq;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public void setFreq(int freq) {
        this.freq = freq;
    }

    @Override
    public String toString() {
        return "[" + word + " : " + freq + "]";
    }

    /**
     * Two WordFreq instances are equal when they hold the same word; the
     * frequency is deliberately ignored (used by List.contains in CompareTask).
     * Fixed: type-checks the argument instead of blindly casting (no
     * ClassCastException) and returns false for null instead of throwing.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof WordFreq)) {
            return false;
        }
        WordFreq other = (WordFreq) obj;
        return word == null ? other.word == null : word.equals(other.word);
    }

    /** Added to honor the equals/hashCode contract: based on the word only. */
    @Override
    public int hashCode() {
        return word == null ? 0 : word.hashCode();
    }
}
43 |
--------------------------------------------------------------------------------
/src/main/java/cm/eg/util/TextUtil.java:
--------------------------------------------------------------------------------
1 | package cm.eg.util;
2 |
3 | import cm.eg.model.Article;
4 | import cm.eg.model.WordFreq;
5 | import com.google.common.collect.HashMultiset;
6 | import com.google.common.collect.Multiset;
7 | import com.google.common.collect.Multiset.Entry;
8 | import com.hankcs.hanlp.HanLP;
9 | import com.hankcs.hanlp.corpus.tag.Nature;
10 | import com.hankcs.hanlp.seg.common.Term;
11 | import org.apache.tika.Tika;
12 | import org.apache.tika.exception.TikaException;
13 |
14 | import java.io.File;
15 | import java.io.IOException;
16 | import java.util.ArrayList;
17 | import java.util.List;
18 | import java.util.function.Predicate;
19 |
20 | public class TextUtil {
21 |
22 | /**
23 | * 从文件中获取数据,封装成对象。会将词频排序,在多个任务的情况下性能得到优化
24 | *
25 | * @param file
26 | */
27 | public static Article getArticle(File file) throws IOException, TikaException {
28 | String text = getString(file);
29 | List segmentList = getSegmentList(text);
30 | List wordFreqList = getWordFrequency(segmentList);
31 | // 词频从高到低排序
32 | wordFreqList.sort((a, b) -> Integer.compare(b.getFreq(), a.getFreq()));
33 |
34 | // 封装Article
35 | Article article = new Article();
36 | article.setName(file.getName());
37 | article.setText(text);
38 | article.setSegmentList(segmentList);
39 | article.setWordFreqList(wordFreqList);
40 | return article;
41 | }
42 |
43 | /**
44 | * 从指定文件读取一整个字符串
45 | *
46 | * @param file
47 | */
48 | private static String getString(File file) throws IOException, TikaException {
49 | Tika tika = new Tika();
50 | tika.setMaxStringLength((int) file.length());
51 | String str = tika.parseToString(file);
52 | return str;
53 | }
54 |
55 | /**
56 | * 将输入的字符串分词处理。
57 | *
58 | * @param text 文本
59 | * @return 切分后的单词
60 | */
61 | private static List getSegmentList(String text) {
62 | List segmentList = HanLP.segment(text);
63 | // 过滤器
64 | segmentList.removeIf(new Predicate() {
65 | /**
66 | * 过滤掉:长度为1的分词、标点符号
67 | */
68 | public boolean test(Term term) {
69 | boolean flag = false;
70 | // 长度
71 | String real = term.word.trim();
72 | if (real.length() <= 1) {
73 | flag = true;
74 | }
75 | // 类型
76 | // 词性以w开头的,为各种标点符号
77 | if (term.nature.startsWith('w')) {
78 | flag = true;
79 | }
80 | // 过滤掉代码
81 | if (term.nature.equals(Nature.nx)) {// 字母专名
82 | flag = true;
83 | }
84 | return flag;
85 | }
86 | });
87 | return segmentList;
88 | }
89 |
90 | /**
91 | * 根据分词集合统计词频
92 | * @param segmentList 词频集合
93 | */
94 | public static List getWordFrequency(List segmentList) {
95 | // 统计词频
96 | Multiset wordSet = HashMultiset.create();
97 | for (Term term : segmentList) {// 放入词汇集合
98 | wordSet.add(term.word);
99 | }
100 | // 从词汇集合取出单词和频次,放入词频集合
101 | List wfList = new ArrayList<>();
102 | for (Entry entry : wordSet.entrySet()) {
103 | wfList.add(new WordFreq(entry.getElement(), entry.getCount()));
104 | }
105 | return wfList;
106 | }
107 |
108 | /**
109 | * 最长公共子串。
110 | */
111 | public static String getLCString(String string1, String string2) {
112 | int len1, len2;
113 | char[] str1 = string1.toCharArray();
114 | char[] str2 = string2.toCharArray();
115 |
116 | len1 = str1.length;
117 | len2 = str2.length;
118 | int maxLen = len1 > len2 ? len1 : len2;
119 |
120 | int[] max = new int[maxLen];// 保存最长子串长度的数组
121 | int[] maxIndex = new int[maxLen];// 保存最长子串长度最大索引的数组
122 | int[] c = new int[maxLen];
123 |
124 | int i, j;
125 | for (i = 0; i < len2; i++) {
126 | for (j = len1 - 1; j >= 0; j--) {
127 | if (str2[i] == str1[j]) {
128 | if ((i == 0) || (j == 0))
129 | c[j] = 1;
130 | else
131 | c[j] = c[j - 1] + 1;// 此时C[j-1]还是上次循环中的值,因为还没被重新赋值
132 | } else {
133 | c[j] = 0;
134 | }
135 |
136 | // 如果是大于那暂时只有一个是最长的,而且要把后面的清0
137 | if (c[j] > max[0]) {
138 | max[0] = c[j];
139 | maxIndex[0] = j;
140 |
141 | for (int k = 1; k < maxLen; k++) {
142 | max[k] = 0;
143 | maxIndex[k] = 0;
144 | }
145 | }
146 | // 有多个是相同长度的子串
147 | else if (c[j] == max[0]) {
148 | for (int k = 1; k < maxLen; k++) {
149 | if (max[k] == 0) {
150 | max[k] = c[j];
151 | maxIndex[k] = j;
152 | break;
153 | }
154 | }
155 | }
156 | }
157 | }
158 | // 最长子字符串
159 | StringBuilder lcs = new StringBuilder();
160 | for (j = 0; j < maxLen; j++) {
161 | if (max[j] > 0) {
162 | for (i = maxIndex[j] - max[j] + 1; i <= maxIndex[j]; i++)
163 | lcs.append((str1[i]));
164 | }
165 | }
166 | return lcs.toString();
167 | }
168 | }
169 |
--------------------------------------------------------------------------------
/src/test/java/cm/eg/DocTest.java:
--------------------------------------------------------------------------------
1 | package cm.eg;
2 | import java.io.File;
3 | import java.io.IOException;
4 |
5 | import org.apache.tika.Tika;
6 | import org.apache.tika.exception.TikaException;
7 | import org.junit.Test;
8 | import org.xml.sax.SAXException;
9 |
10 | /**
11 | * 处理文档测试。
12 | */
13 | public class DocTest {
14 |
15 | private static String docDir = "doc/";
16 |
17 | @Test
18 | public void demo1() throws TikaException, IOException, SAXException {
19 | File file = new File(docDir + "Wu_info1.doc");
20 | System.out.println("length:" + file.length());
21 |
22 | Tika tika = new Tika();
23 | System.out.println("tika detect():" + tika.detect(file));
24 | String text = tika.parseToString(file);
25 |
26 | System.out.println("file string length:" + text.length());
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/java/cm/eg/ProcessTest.java:
--------------------------------------------------------------------------------
1 | package cm.eg;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 |
8 | import cm.eg.model.Article;
9 | import cm.eg.model.CompareReport;
10 | import cm.eg.model.CompareTask;
11 | import cm.eg.model.WordFreq;
12 | import cm.eg.util.TextUtil;
13 |
14 | import org.apache.tika.exception.TikaException;
15 | import org.junit.Test;
16 |
17 | import com.hankcs.hanlp.seg.common.Term;
18 |
19 | /**
20 | * 文本相似度处理测试。
21 | */
22 | public class ProcessTest {
23 |
24 | private static String docDir = "doc/";
25 |
26 | private static String txtDir = "txt/";
27 |
28 | /**
29 | * 显示词频最高的若干项
30 | */
31 | @Test
32 | public void demo1() throws IOException, TikaException {
33 | Article article = TextUtil.getArticle(new File(txtDir + "小王子.txt"));
34 | List wfList = article.getWordFreqList();
35 |
36 | System.out.println("高频词排名:");
37 | for (int i = 0; i < 50; ++i) {
38 | System.out.println(wfList.get(i));
39 | }
40 | }
41 |
42 |
43 | /**
44 | * 向量相似度算法
45 | */
46 | @Test
47 | public void demo2() throws IOException, TikaException {
48 | // TODO 涉及读写可用多线程FutureTask优化。
49 | Article a1 = TextUtil.getArticle(new File(docDir + "Li_info2.doc"));
50 | Article a2 = TextUtil.getArticle(new File(docDir + "Wu_info2.doc"));
51 | CompareTask task = new CompareTask(a1, a2);
52 | task.execute();
53 | CompareReport report = task.getCompareReport();
54 | System.out.println(report);
55 | }
56 |
57 |
58 | /**
59 | * 报告多个测试
60 | */
61 | @Test
62 | public void demo3() throws IOException, TikaException {
63 | List tasks = new ArrayList();
64 |
65 | Article a1 = TextUtil.getArticle(new File(docDir + "Wu_info1.doc"));
66 | Article b1 = TextUtil.getArticle(new File(docDir + "Li_info1.doc"));
67 | tasks.add(new CompareTask(a1, b1));
68 | System.out.println("load 1");
69 |
70 | Article a2 = TextUtil.getArticle(new File(docDir + "Wu_info2.doc"));
71 | Article b2 = TextUtil.getArticle(new File(docDir + "Li_info2.doc"));
72 | tasks.add(new CompareTask(a2, b2));
73 | System.out.println("load 2");
74 |
75 | Article a3 = TextUtil.getArticle(new File(docDir + "Wu_info3.doc"));
76 | Article b3 = TextUtil.getArticle(new File(docDir + "Li_info3.doc"));
77 | tasks.add(new CompareTask(a3, b3));
78 | System.out.println("load 3");
79 |
80 | Article a4 = TextUtil.getArticle(new File(docDir + "Wu_info4.doc"));
81 | Article b4 = TextUtil.getArticle(new File(docDir + "Li_info4.doc"));
82 | tasks.add(new CompareTask(a4, b4));
83 | System.out.println("load 4");
84 |
85 | for (CompareTask task : tasks) {
86 | task.execute();
87 | System.out.println(task.getCompareReport().getConsoleDetail());
88 | System.out.println();
89 | }
90 | }
91 |
92 |
93 | /**
94 | * 公共子序列
95 | */
96 | @Test
97 | public void demo4() throws IOException, TikaException {
98 | // 获得分词后的一整串字符
99 | Article a1 = TextUtil.getArticle(new File(docDir + "Wu_info3.doc"));
100 | Article a2 = TextUtil.getArticle(new File(docDir + "Li_info1.doc"));
101 | List termList1 = a1.getSegmentList();
102 | List termList2 = a2.getSegmentList();
103 | StringBuilder segStr1 = new StringBuilder();
104 | StringBuilder segStr2 = new StringBuilder();
105 |
106 | for (Term term : termList1) {
107 | segStr1.append(term.word);
108 | }
109 | for (Term term : termList2) {
110 | segStr2.append(term.word);
111 | }
112 | // 最长公共子序列
113 | String lcs = "";
114 | String str1 = segStr1.toString();
115 | String str2 = segStr2.toString();
116 |
117 | for (int i = 1; i <= 5; ++i) {
118 | if (i != 1) {
119 | str1 = str1.replaceAll(lcs, "");
120 | str2 = str2.replaceAll(lcs, "");
121 | }
122 | lcs = TextUtil.getLCString(str1, str2);
123 | System.out.println("第" + i + "个最长公共子序列(长度" + lcs.length() + "):");
124 | System.out.println(lcs);
125 | System.out.println();
126 | }
127 | }
128 |
129 | }
130 |
--------------------------------------------------------------------------------
/src/test/java/cm/eg/SegmentTest.java:
--------------------------------------------------------------------------------
1 | package cm.eg;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import com.hankcs.hanlp.seg.common.Term;
5 | import com.hankcs.hanlp.tokenizer.StandardTokenizer;
6 | import org.junit.Test;
7 |
8 | import java.util.List;
9 |
10 | /**
11 | * 分词测试。
12 | */
13 | public class SegmentTest {
14 |
15 | /**
16 | * 标准分词器。
17 | * 推荐使用这种HanLP静态调用的方式。
18 | */
19 | @Test
20 | public void demo1() {
21 | List list = HanLP.segment("你好,欢迎使用HanLP汉语处理包!");
22 | for (Term term : list) {
23 | System.out.println(term.word + " " + term.nature);
24 | }
25 | }
26 |
27 | /**
28 | * 标准分词
29 | */
30 | @Test
31 | public void demo2() {
32 | List termList = StandardTokenizer.segment("商品和服务");
33 | System.out.println(termList);
34 | }
35 |
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/src/test/resource/doc/Li_info1.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info1.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Li_info2.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info2.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Li_info3.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info3.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Li_info4.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info4.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Wu_info1.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info1.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Wu_info2.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info2.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Wu_info3.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info3.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Wu_info4.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info4.doc
--------------------------------------------------------------------------------
/src/test/resource/txt/小王子.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/txt/小王子.txt
--------------------------------------------------------------------------------