├── README.md ├── pom.xml └── src ├── main └── java │ └── cm │ └── eg │ ├── model │ ├── Article.java │ ├── CompareReport.java │ ├── CompareTask.java │ └── WordFreq.java │ └── util │ └── TextUtil.java └── test ├── java └── cm │ └── eg │ ├── DocTest.java │ ├── ProcessTest.java │ └── SegmentTest.java └── resource ├── doc ├── Li_info1.doc ├── Li_info2.doc ├── Li_info3.doc ├── Li_info4.doc ├── Wu_info1.doc ├── Wu_info2.doc ├── Wu_info3.doc └── Wu_info4.doc └── txt └── 小王子.txt /README.md: -------------------------------------------------------------------------------- 1 | # HanLP-TextSimilarity 2 | 3 | 中文分词、统计词频、比对文本相似度 4 | 5 | 使用了汉语言处理包HanLP (https://github.com/hankcs/HanLP) 6 | 7 | 2017.4.18 8 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | cm.eg 7 | text-similarity 8 | 0.0.2 9 | 10 | 11 | 12 | 13 | com.hankcs 14 | hanlp 15 | portable-1.7.3 16 | 17 | 18 | 19 | com.google.guava 20 | guava 21 | 27.1-jre 22 | 23 | 24 | 25 | org.apache.tika 26 | tika-core 27 | 1.20 28 | 29 | 30 | 31 | org.apache.tika 32 | tika-parsers 33 | 1.20 34 | 35 | 36 | org.apache.poi 37 | poi-ooxml-schemas 38 | 39 | 40 | 41 | 42 | 43 | org.apache.poi 44 | ooxml-schemas 45 | 1.3 46 | 47 | 48 | 49 | org.apache.poi 50 | ooxml-security 51 | 1.1 52 | 53 | 54 | 55 | junit 56 | junit 57 | 4.12 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/main/java/cm/eg/model/Article.java: -------------------------------------------------------------------------------- 1 | package cm.eg.model; 2 | 3 | import java.util.List; 4 | 5 | import com.hankcs.hanlp.seg.common.Term; 6 | 7 | public class Article { 8 | private String name; 9 | 10 | private String text; 11 | 12 | private List segmentList; 13 | private List wordFreqList; 14 | 15 | public List getSegmentList() { 16 | return segmentList; 17 | } 18 | 19 | public 
void setSegmentList(List segmentList) { 20 | this.segmentList = segmentList; 21 | } 22 | 23 | public String getName() { 24 | return name; 25 | } 26 | 27 | public void setName(String name) { 28 | this.name = name; 29 | } 30 | 31 | public String getText() { 32 | return text; 33 | } 34 | 35 | public List getWordFreqList() { 36 | return wordFreqList; 37 | } 38 | 39 | public void setText(String text) { 40 | this.text = text; 41 | } 42 | 43 | public void setWordFreqList(List wordFreqList) { 44 | this.wordFreqList = wordFreqList; 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/cm/eg/model/CompareReport.java: -------------------------------------------------------------------------------- 1 | package cm.eg.model; 2 | 3 | /** 4 | * 文本相似度的对比报告。 5 | */ 6 | public class CompareReport { 7 | private Article a1; 8 | private Article a2; 9 | 10 | private int top; 11 | 12 | private double similarity; 13 | 14 | public CompareReport(Article a1, Article a2, int top, double similarity) { 15 | this.a1 = a1; 16 | this.a2 = a2; 17 | this.top = top; 18 | this.similarity = similarity; 19 | } 20 | 21 | @Override 22 | 23 | /** 24 | * 返回比较结果的简单报告。 25 | * 可调用getDetail()方法获得详细报告。 26 | */ 27 | public String toString() { 28 | return String.format("%s和%s 相似度:%.0f%%", a1.getName(), a2.getName(), similarity*100); 29 | } 30 | 31 | public String getConsoleDetail() { 32 | StringBuilder sb = new StringBuilder(); 33 | String ls = System.lineSeparator(); 34 | sb.append(a1.getName() +" 长度:"+a1.getText().length() +" 高频词:"); 35 | sb.append(ls); 36 | sb.append(a1.getWordFreqList().subList(0, top)); 37 | sb.append(ls); 38 | sb.append(a2.getName() +" 长度:"+a2.getText().length() + " 高频词:"); 39 | sb.append(ls); 40 | sb.append(a2.getWordFreqList().subList(0, top)); 41 | sb.append(ls); 42 | sb.append("相似度:" + String.format("%.4f",similarity)); 43 | return sb.toString(); 44 | } 45 | } 46 | 
--------------------------------------------------------------------------------
/src/main/java/cm/eg/model/CompareTask.java:
--------------------------------------------------------------------------------
package cm.eg.model;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
 * Task that compares two articles by cosine similarity over their
 * top-frequency word vectors. Tracks completion state; the result is
 * available via {@link #getCompareReport()} after {@link #execute()}.
 */
public class CompareTask {

    private Article a1;
    private Article a2;

    private boolean isFinished;

    private CompareReport compareReport;

    // Number of top-frequency words used to build the vectors.
    private static final int TOP_MAX = 20;

    public CompareTask(Article a1, Article a2) {
        this.a1 = a1;
        this.a2 = a2;
        this.isFinished = false;
    }

    /**
     * Builds aligned frequency vectors over the union of both articles'
     * top-{@code TOP_MAX} words and computes their cosine similarity.
     * Assumes each article's word-frequency list has at least TOP_MAX
     * entries (TextUtil sorts it descending before this runs).
     */
    public void execute() {
        int top = TOP_MAX;
        List<WordFreq> freq1 = a1.getWordFreqList().subList(0, top);
        List<WordFreq> freq2 = a2.getWordFreqList().subList(0, top);
        // Union of both articles' high-frequency words
        // (WordFreq.equals compares by word only, so contains() works by word).
        ArrayList<WordFreq> union = new ArrayList<>(freq1);
        union.addAll(freq2);
        // Build the two vectors: every word in the union appears in each
        // vector, with frequency 0 where the article does not contain it.
        ArrayList<WordFreq> v1 = new ArrayList<>(freq1);
        ArrayList<WordFreq> v2 = new ArrayList<>(freq2);
        for (WordFreq wf : union) {
            if (!v1.contains(wf)) {
                v1.add(new WordFreq(wf.getWord(), 0));
            }
            if (!v2.contains(wf)) {
                v2.add(new WordFreq(wf.getWord(), 0));
            }
        }
        // Sort both vectors by word so identical indices refer to the same word.
        Comparator<WordFreq> strComp = Comparator.comparing(WordFreq::getWord);
        v1.sort(strComp);
        v2.sort(strComp);
        // Cosine similarity: dot(v1, v2) / (|v1| * |v2|)
        double vProduct = 0;
        long sumSquare1 = 0; // long: int squares summed over the union can overflow
        long sumSquare2 = 0;
        for (int i = 0; i < v1.size(); ++i) {
            int num1 = v1.get(i).getFreq();
            int num2 = v2.get(i).getFreq();
            vProduct += (double) num1 * num2;
            sumSquare1 += (long) num1 * num1;
            sumSquare2 += (long) num2 * num2;
        }
        // Fixed: the original multiplied two ints before sqrt, which can
        // overflow; widen to double before multiplying.
        double normProduct = Math.sqrt((double) sumSquare1 * sumSquare2);
        // Guard: if either vector is all zeros the quotient would be NaN;
        // define the similarity of an empty vector as 0.
        double similarity = normProduct == 0 ? 0 : vProduct / normProduct;
        isFinished = true;
        makeReport(a1, a2, similarity);
    }

    private void makeReport(Article a1, Article a2, double similarity) {
        compareReport = new CompareReport(a1, a2, TOP_MAX, similarity);
    }

    public boolean isFinished() {
        return isFinished;
    }

    public CompareReport getCompareReport() {
        return compareReport;
    }

}
--------------------------------------------------------------------------------
/src/main/java/cm/eg/model/WordFreq.java:
--------------------------------------------------------------------------------
package cm.eg.model;

/**
 * A word together with its occurrence count. Equality is defined by the
 * word only (frequency is ignored) so that vector-membership tests in
 * CompareTask match words regardless of their counts.
 */
public class WordFreq {

    private String word;
    private int freq;

    public WordFreq(String word, int freq) {
        this.word = word;
        this.freq = freq;
    }

    public String getWord() {
        return word;
    }

    public int getFreq() {
        return freq;
    }

    public void setWord(String word) {
        this.word = word;
    }

    public void setFreq(int freq) {
        this.freq = freq;
    }

    @Override
    public String toString() {
        return "[" + word + " : " + freq + "]";
    }

    /**
     * Equal iff the other object is a WordFreq with the same word.
     * Fixed: the original cast unconditionally (ClassCastException on a
     * foreign type, NPE on null) and did not override hashCode.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof WordFreq)) {
            return false;
        }
        return this.getWord().equals(((WordFreq) obj).getWord());
    }

    // Added: hashCode consistent with equals (by word only).
    @Override
    public int hashCode() {
        return word.hashCode();
    }
}
--------------------------------------------------------------------------------
/src/main/java/cm/eg/util/TextUtil.java:
--------------------------------------------------------------------------------
package cm.eg.util;

import cm.eg.model.Article;
import cm.eg.model.WordFreq;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.seg.common.Term;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class TextUtil {

    /**
     * Reads a file and wraps it as an {@link Article}: extracts the text,
     * segments it, counts word frequencies and sorts them descending (the
     * sort is done once here so multiple compare tasks don't repeat it).
     *
     * @param file source document (any format Tika can parse)
     * @throws IOException   on read failure
     * @throws TikaException on parse failure
     */
    public static Article getArticle(File file) throws IOException, TikaException {
        String text = getString(file);
        List<Term> segmentList = getSegmentList(text);
        List<WordFreq> wordFreqList = getWordFrequency(segmentList);
        // Highest frequency first.
        wordFreqList.sort((a, b) -> Integer.compare(b.getFreq(), a.getFreq()));

        Article article = new Article();
        article.setName(file.getName());
        article.setText(text);
        article.setSegmentList(segmentList);
        article.setWordFreqList(wordFreqList);
        return article;
    }

    /**
     * Extracts the whole file as one string via Tika.
     *
     * @param file source document
     */
    private static String getString(File file) throws IOException, TikaException {
        Tika tika = new Tika();
        // Raise Tika's default 100k-char cap to the file size. Byte length is
        // an upper bound on char count for the encodings involved here.
        tika.setMaxStringLength((int) file.length());
        return tika.parseToString(file);
    }

    /**
     * Segments the input text with HanLP and filters out terms that carry
     * no comparison signal.
     *
     * @param text input text
     * @return filtered list of segmented terms
     */
    private static List<Term> getSegmentList(String text) {
        List<Term> segmentList = HanLP.segment(text);
        // Drop: single-character words, punctuation (natures starting with
        // 'w'), and letter-only proper nouns (Nature.nx, e.g. code fragments).
        segmentList.removeIf(term -> {
            String real = term.word.trim();
            if (real.length() <= 1) {
                return true;
            }
            if (term.nature.startsWith('w')) {
                return true;
            }
            return term.nature.equals(Nature.nx);
        });
        return segmentList;
    }

    /**
     * Counts how often each distinct word occurs in the segment list.
     *
     * @param segmentList segmented terms
     * @return unordered list of (word, count) pairs
     */
    public static List<WordFreq> getWordFrequency(List<Term> segmentList) {
        Multiset<String> wordSet = HashMultiset.create();
        for (Term term : segmentList) {
            wordSet.add(term.word);
        }
        List<WordFreq> wfList = new ArrayList<>(wordSet.entrySet().size());
        for (Entry<String> entry : wordSet.entrySet()) {
            wfList.add(new WordFreq(entry.getElement(), entry.getCount()));
        }
        return wfList;
    }

    /**
     * Longest common substring of two strings (dynamic programming over a
     * single reused row). Returns the concatenation of all longest common
     * substrings when several have the same maximal length.
     */
    public static String getLCString(String string1, String string2) {
        int len1, len2;
        char[] str1 = string1.toCharArray();
        char[] str2 = string2.toCharArray();

        len1 = str1.length;
        len2 = str2.length;
        int maxLen = len1 > len2 ? len1 : len2;

        int[] max = new int[maxLen];      // lengths of the current longest substrings
        int[] maxIndex = new int[maxLen]; // end indices (in str1) of those substrings
        int[] c = new int[maxLen];        // DP row: c[j] = match run ending at str1[j]

        int i, j;
        for (i = 0; i < len2; i++) {
            // Iterate j downwards so c[j-1] still holds the previous row's value.
            for (j = len1 - 1; j >= 0; j--) {
                if (str2[i] == str1[j]) {
                    if ((i == 0) || (j == 0))
                        c[j] = 1;
                    else
                        c[j] = c[j - 1] + 1;
                } else {
                    c[j] = 0;
                }

                // Strictly longer run: it becomes the only candidate; clear the rest.
                if (c[j] > max[0]) {
                    max[0] = c[j];
                    maxIndex[0] = j;

                    for (int k = 1; k < maxLen; k++) {
                        max[k] = 0;
                        maxIndex[k] = 0;
                    }
                }
                // Same length as the current maximum: record as an additional candidate.
                else if (c[j] == max[0]) {
                    for (int k = 1; k < maxLen; k++) {
                        if (max[k] == 0) {
                            max[k] = c[j];
                            maxIndex[k] = j;
                            break;
                        }
                    }
                }
            }
        }
        // Reassemble each recorded substring from str1.
        StringBuilder lcs = new StringBuilder();
        for (j = 0; j < maxLen; j++) {
            if (max[j] > 0) {
                for (i = maxIndex[j] - max[j] + 1; i <= maxIndex[j]; i++)
                    lcs.append(str1[i]);
            }
        }
        return lcs.toString();
    }
}
--------------------------------------------------------------------------------
/src/test/java/cm/eg/DocTest.java:
--------------------------------------------------------------------------------
package cm.eg;

import java.io.File;
import java.io.IOException;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.junit.Test;
import org.xml.sax.SAXException;

/**
 * Document-handling tests (Tika extraction).
 */
public class DocTest {

    private static String docDir = "doc/";

    @Test
    public void demo1() throws TikaException, IOException, SAXException {
        File file = new File(docDir + "Wu_info1.doc");
        System.out.println("length:" + file.length());

        Tika tika = new Tika();
        System.out.println("tika detect():" + tika.detect(file));
        String text = tika.parseToString(file);

        System.out.println("file string length:" + text.length());
    }

}
--------------------------------------------------------------------------------
/src/test/java/cm/eg/ProcessTest.java:
--------------------------------------------------------------------------------
package cm.eg;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import cm.eg.model.Article;
import cm.eg.model.CompareReport;
import cm.eg.model.CompareTask;
import cm.eg.model.WordFreq;
import cm.eg.util.TextUtil;

import org.apache.tika.exception.TikaException;
import org.junit.Test;

import com.hankcs.hanlp.seg.common.Term;

/**
 * Text-similarity pipeline tests.
 */
public class ProcessTest {

    private static String docDir = "doc/";

    private static String txtDir = "txt/";

    /**
     * Prints the highest-frequency words.
     */
    @Test
    public void demo1() throws IOException, TikaException {
        Article article = TextUtil.getArticle(new File(txtDir + "小王子.txt"));
        List<WordFreq> wfList = article.getWordFreqList();

        System.out.println("高频词排名:");
        // Guard against articles with fewer than 50 distinct words.
        int limit = Math.min(50, wfList.size());
        for (int i = 0; i < limit; ++i) {
            System.out.println(wfList.get(i));
        }
    }

    /**
     * Vector (cosine) similarity.
     */
    @Test
    public void demo2() throws IOException, TikaException {
        // TODO reading/parsing could be parallelized with FutureTask.
        Article a1 = TextUtil.getArticle(new File(docDir + "Li_info2.doc"));
        Article a2 = TextUtil.getArticle(new File(docDir + "Wu_info2.doc"));
        CompareTask task = new CompareTask(a1, a2);
        task.execute();
        CompareReport report = task.getCompareReport();
        System.out.println(report);
    }

    /**
     * Detailed reports for several comparisons.
     */
    @Test
    public void demo3() throws IOException, TikaException {
        List<CompareTask> tasks = new ArrayList<>();

        Article a1 = TextUtil.getArticle(new File(docDir + "Wu_info1.doc"));
        Article b1 = TextUtil.getArticle(new File(docDir + "Li_info1.doc"));
        tasks.add(new CompareTask(a1, b1));
        System.out.println("load 1");

        Article a2 = TextUtil.getArticle(new File(docDir + "Wu_info2.doc"));
        Article b2 = TextUtil.getArticle(new File(docDir + "Li_info2.doc"));
        tasks.add(new CompareTask(a2, b2));
        System.out.println("load 2");

        Article a3 = TextUtil.getArticle(new File(docDir + "Wu_info3.doc"));
        Article b3 = TextUtil.getArticle(new File(docDir + "Li_info3.doc"));
        tasks.add(new CompareTask(a3, b3));
        System.out.println("load 3");

        Article a4 = TextUtil.getArticle(new File(docDir + "Wu_info4.doc"));
        Article b4 = TextUtil.getArticle(new File(docDir + "Li_info4.doc"));
        tasks.add(new CompareTask(a4, b4));
        System.out.println("load 4");

        for (CompareTask task : tasks) {
            task.execute();
            System.out.println(task.getCompareReport().getConsoleDetail());
            System.out.println();
        }
    }

    /**
     * Longest common substrings of the segmented texts.
     */
    @Test
    public void demo4() throws IOException, TikaException {
        // Join the segmented words of each article back into one string.
        Article a1 = TextUtil.getArticle(new File(docDir + "Wu_info3.doc"));
        Article a2 = TextUtil.getArticle(new File(docDir + "Li_info1.doc"));
        List<Term> termList1 = a1.getSegmentList();
        List<Term> termList2 = a2.getSegmentList();
        StringBuilder segStr1 = new StringBuilder();
        StringBuilder segStr2 = new StringBuilder();

        for (Term term : termList1) {
            segStr1.append(term.word);
        }
        for (Term term : termList2) {
            segStr2.append(term.word);
        }
        // Repeatedly extract and remove the longest common substring.
        String lcs = "";
        String str1 = segStr1.toString();
        String str2 = segStr2.toString();

        for (int i = 1; i <= 5; ++i) {
            if (i != 1) {
                // Fixed: replace() (literal) instead of replaceAll() — the
                // original treated the extracted substring as a regex, which
                // breaks on text containing regex metacharacters.
                str1 = str1.replace(lcs, "");
                str2 = str2.replace(lcs, "");
            }
            lcs = TextUtil.getLCString(str1, str2);
            System.out.println("第" + i + "个最长公共子序列(长度" + lcs.length() + "):");
            System.out.println(lcs);
            System.out.println();
        }
    }

}
--------------------------------------------------------------------------------
/src/test/java/cm/eg/SegmentTest.java:
--------------------------------------------------------------------------------
package cm.eg;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import org.junit.Test;

import java.util.List;

/**
 * Segmentation tests.
 */
public class SegmentTest {

    /**
     * Standard segmenter via the recommended static HanLP entry point.
     */
    @Test
    public void demo1() {
        List<Term> list = HanLP.segment("你好,欢迎使用HanLP汉语处理包!");
        for (Term term : list) {
            System.out.println(term.word + " " + term.nature);
        }
    }

    /**
     * Standard tokenizer.
     */
    @Test
    public void demo2() {
        List<Term> termList = StandardTokenizer.segment("商品和服务");
        System.out.println(termList);
    }

}
--------------------------------------------------------------------------------
/src/test/resource/doc/Li_info1.doc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info1.doc
--------------------------------------------------------------------------------
/src/test/resource/doc/Li_info2.doc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info2.doc -------------------------------------------------------------------------------- /src/test/resource/doc/Li_info3.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info3.doc -------------------------------------------------------------------------------- /src/test/resource/doc/Li_info4.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Li_info4.doc -------------------------------------------------------------------------------- /src/test/resource/doc/Wu_info1.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info1.doc -------------------------------------------------------------------------------- /src/test/resource/doc/Wu_info2.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info2.doc -------------------------------------------------------------------------------- /src/test/resource/doc/Wu_info3.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info3.doc -------------------------------------------------------------------------------- 
/src/test/resource/doc/Wu_info4.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/doc/Wu_info4.doc -------------------------------------------------------------------------------- /src/test/resource/txt/小王子.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mokou591/HanLP-TextSimilarity/7a25b858124bca2accb19b5ab0e08428e29ee82c/src/test/resource/txt/小王子.txt --------------------------------------------------------------------------------