├── README.md
└── com
    └── ext
        ├── conf
        │   ├── Config.java
        │   ├── Sentence.java
        │   └── StopWords.java
        ├── db
        │   └── MysqlDemo.java
        ├── dict
        │   └── WordDict.java
        ├── entrance
        │   └── Main.java
        ├── feature
        │   ├── GetSimilarityRatio.java
        │   ├── NerFrequency.java
        │   ├── SenAndTitleSimilarityRatio.java
        │   ├── SentenceAndTitle.java
        │   ├── SentenceAndTopicWord.java
        │   ├── SentenceLengh.java
        │   ├── SentencePosition.java
        │   ├── TitleClassification.java
        │   └── WordFrequencyToSen.java
        ├── ner
        │   ├── ExtractNer.java
        │   └── SegWordNer.java
        ├── predeal
        │   ├── CutTextIntoSentence.java
        │   └── CutTextIntoSentences.java
        ├── seg
        │   └── word
        │       ├── SegWord.java
        │       └── SegWordByNlpir.java
        ├── tfidf
        │   └── WordWeight.java
        ├── topic
        │   ├── utils
        │   │   ├── ReadConfigUtil.java
        │   │   └── SystemParas.java
        │   └── word
        │       ├── ExtractTopicWord.java
        │       └── NlpirTest.java
        └── word
            └── weight
                ├── GetAllEntropy.java
                └── GetWordEntropy.java
/README.md: --------------------------------------------------------------------------------
1 | # ExtractTopicSentence
2 | Topic-sentence extraction based on title classification.
3 | The method: given a news article, compute the similarity between the title and the article's topic-word set to decide whether the title is suggestive of the topic. If it is, extract the sentence of the article most similar to the title as the topic sentence; if not, combine several features to score the importance of every sentence and take the highest-scoring sentence as the topic sentence.
4 | 
5 | 1. Building an article's topic-word set
6 | (1) For crawled articles that come with tags or keywords, take the tags as part of the topic-word set.
7 | (2) Preprocess the text: split it into sentences, segment the words, remove stop words, and filter by part of speech and word frequency.
8 | (3) Weight every word in the article by information entropy.
9 | (4) Extract keywords with the NLPIR keyword-extraction tool; the keywords plus the tags form the article's topic-word set.
10 | 
11 | 2. Title classification
12 | Titles fall into two classes, suggestive and non-suggestive, decided by the similarity between the title and the topic-word set.
13 | (1) Segment the title, then count how many title words also occur in the topic-word set and use that count as the similarity. Only content words are considered: verbs, nouns, named entities, and the like. The similarity threshold is set to 1: a title is suggestive when the overlap count is at least 1 and non-suggestive otherwise.
14 | A suggestive title is labeled 1, a non-suggestive one 0.
15 | 
16 | 3. Topic-sentence features
17 | (1) Relative word frequency of a sentence
18 | Using the word weights computed earlier, sum the weights of all words in the sentence; call the sum A. Let B be the corresponding sum for the sentence with the largest such sum in the article. Then:
19 | Score(Si) = A / B
20 | 
21 | (2) Sentence length
22 | Topic sentences tend to be long sentences. As implemented in SentenceLengh.java, the feature is binary:
23 | Score(Si) = 1 if len(Si) > C, else 0
24 | Here C = 16.
25 | (3) Named entities
26 | Count named-entity words in the sentence (words answering who/whom/when/where); the more of them a sentence contains, the likelier it is a topic sentence. With A named-entity occurrences and sentence length B:
27 | Score(Si) = A / B
28 | 
29 | (4) Word overlap between sentence and title
30 | Find the words the sentence shares with the title. Let A be the weight sum of those shared words and B the weight sum of all title words. Then:
31 | Score(Si) = A / B
32 | (5) Similarity between sentence and title
33 | Compute the (edit-distance based) string similarity between the sentence and the title:
34 | Score(Si) = SimilarityRatio
35 | (6) Overlap between sentence and keyword set
36 | Let A be the number of words the segmented sentence shares with the keyword set and B the number of words in the keyword set. Then:
37 | Score(Si) = A / B
38 | (7) Sentence position
39 | For the i-th of the N sentences of an article (as implemented in SentencePosition.java):
40 | Score(Si) = 1 if i < l or Si opens a paragraph, else 1 - log(i)/log(N)
41 | Here l = 3; the text has to be split into sentences paragraph by paragraph.
42 | (8) (Not used yet) Whether the sentence contains an event trigger word: Score = 1 if it does, 0 if not.
43 | Event trigger lexicon: libevent.
44 | The value computed at each step above becomes one feature value of the sentence.
45 | 
46 | 4. Topic-sentence extraction
47 | Each sentence's total score is a linear combination of its feature values; as implemented in Main.java:
48 | Score(Si) = Σk αk · Wk · Scorek(Si)
49 | where α indicates whether the title is suggestive (1 if it is, 0 otherwise) and gates only the title-based features (4) and (5); αk = 1 for the other features.
50 | Wk is the weight of feature (1)-(7), and Scorek is the computed feature value.
51 | The Wk have to be trained on a labeled corpus to find the best combination.
52 | 
53 | 5. Learning the feature weights
54 | The feature weights are estimated with the GIS (Generalized Iterative Scaling) algorithm.
55 | The resulting weights for features (1)-(7) are 1: 0.05, 2: 0.05, 3: 0.05, 4: 0.2, 5: 0.2, 6: 0.15, 7: 0.3.
56 | 
57 | Appendix: how word weights are computed
58 | By an information-entropy formula (see GetAllEntropy.java / GetWordEntropy.java). No large training corpus is needed; each computed entropy value is used directly as the word's weight.
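To make step 4 concrete, here is a minimal Java sketch of the score combination, using the trained weights from step 5 and gating the two title-based features with α the way Main.java does. The class name, method name, and the plain double[] feature vector are illustrative only, not part of the project's API.

public class ScoringSketch {

    // Trained weights for features (1)-(7), from step 5 of the README.
    private static final double[] W = {0.05, 0.05, 0.05, 0.2, 0.2, 0.15, 0.3};

    /**
     * Combines the seven feature values of one sentence into its total score.
     * f[k] holds Score_{k+1}(Si); alpha is 1 for a suggestive title, else 0,
     * and applies only to features (4) and (5), as in Main.java.
     */
    public static double totalScore(double[] f, int alpha) {
        double score = 0.0;
        for (int k = 0; k < W.length; k++) {
            boolean titleFeature = (k == 3 || k == 4); // features (4) and (5)
            score += (titleFeature ? alpha : 1) * W[k] * f[k];
        }
        return score;
    }
}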
-------------------------------------------------------------------------------- /com/ext/conf/Config.java: --------------------------------------------------------------------------------
1 | package com.ext.conf;
2 | 
3 | import java.io.UnsupportedEncodingException;
4 | 
5 | /**
6 |  * Resolves the application's base path and the configuration directories
7 |  * derived from it.
8 |  *
9 |  * @author srcb04161
10 |  */
11 | public class Config {
12 | 
13 |     public static String APP_PATH = "";
14 |     public static boolean isWebApp = false;
15 |     public static final boolean PRINT = false;
16 | 
17 |     public static final boolean ENCRYPT = false;
18 | 
19 |     static {
20 |         APP_PATH = getAppPath() + "/";
21 |         APP_PATH = postAppPath(APP_PATH);
22 |     }
23 | 
24 |     public static String CONF_PATH = APP_PATH + "conf/";
25 |     public static String SEG_PATH = APP_PATH + "conf/seg/";
26 |     public static String DICT_PATH = APP_PATH + "conf/dict/";
27 | 
28 |     static {
29 |         SetAppPath(APP_PATH);
30 |     }
31 | 
32 |     // Normalizes the derived directories to absolute paths; the parameter is
33 |     // unused because the method works on the static fields directly.
34 |     public static void SetAppPath(String path) {
35 |         SEG_PATH = getFullPath(SEG_PATH);
36 |         DICT_PATH = getFullPath(DICT_PATH);
37 |     }
38 | 
39 |     public static String getFullPath(String path) {
40 |         if (path == null || path.equals(""))
41 |             return null;
42 |         path = path.replace('\\', '/');
43 | 
44 |         // already absolute (Unix style, or with a drive/scheme colon)
45 |         if (path.charAt(0) == '/')
46 |             return path;
47 |         if (path.indexOf(":") != -1)
48 |             return path;
49 | 
50 |         String curPath = APP_PATH;
51 |         if (path.length() >= 2 && path.substring(0, 2).equals("..")) {
52 |             int index = curPath.lastIndexOf("/");
53 |             if (index != -1)
54 |                 curPath = curPath.substring(0, index);
55 |             path = path.substring(1, path.length());
56 |         }
57 | 
58 |         if (path.charAt(0) == '.')
59 |             path = path.substring(1, path.length());
60 | 
61 |         return curPath + path;
62 |     }
63 | 
64 |     // Locates the application path from the location of this class file.
65 |     private static String getAppPath() {
66 |         String appPath = null;
67 |         String clsName = Config.class.getName();
68 |         clsName = clsName.replace('.', '/');
69 |         clsName += ".class";
70 | 
71 |         java.net.URL url = Config.class.getClassLoader().getResource(clsName);
72 |         try {
73 |             appPath = java.net.URLDecoder.decode(url.toString(), "UTF-8");
74 |         } catch (UnsupportedEncodingException e) {
75 |             e.printStackTrace();
76 |             System.exit(0);
77 |         }
78 |         int pos = appPath.indexOf("file:");
79 |         if (pos != -1)
80 |             appPath = appPath.substring(pos + 5);
81 | 
82 |         pos = appPath.indexOf(clsName);
83 |         if (pos != -1)
84 |             appPath = appPath.substring(0, pos - 1);
85 | 
86 |         if (appPath.endsWith("!"))
87 |             appPath = appPath.substring(0, appPath.lastIndexOf("/"));
88 | 
89 |         if (appPath.indexOf(":") != -1 && appPath.charAt(0) == '/')
90 |             appPath = appPath.substring(1, appPath.length());
91 | 
92 |         return appPath;
93 |     }
94 | 
95 |     // Strips the WEB-INF/ or bin/ suffix so APP_PATH points at the project root.
96 |     private static String postAppPath(String path) {
97 |         if (!path.isEmpty()) {
98 |             int pos = path.lastIndexOf("WEB-INF/");
99 |             if (pos != -1)
100 |                 isWebApp = true;
101 |             else
102 |                 pos = path.lastIndexOf("bin/");
103 |             if (pos != -1)
104 |                 path = path.substring(0, pos);
105 |         }
106 |         return path;
107 |     }
108 | }
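As a companion to step 2 of the README, the sketch below isolates the overlap test that TitleClassification.java (further down) performs: segment the title, count title words contained in the topic-word set, and call the title suggestive when the count reaches the threshold of 1. The hard-coded title words and topic words are invented sample data.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class TitleOverlapSketch {

    /** Returns 1 (suggestive) if at least one title word occurs in the topic-word set, else 0. */
    public static int classify(String[] titleWords, Set<String> topicWords) {
        int count = 0;
        for (String w : titleWords) {
            if (topicWords.contains(w)) {
                count++;
            }
        }
        return count >= 1 ? 1 : 0;
    }

    public static void main(String[] args) {
        // Hypothetical segmented title and topic-word set.
        String[] titleWords = {"阿里", "发布", "互联网", "汽车"};
        Set<String> topicWords = new HashSet<String>(Arrays.asList("互联网", "无人驾驶", "阿里"));
        System.out.println(classify(titleWords, topicWords)); // prints 1: the title is suggestive
    }
}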
-------------------------------------------------------------------------------- /com/ext/conf/Sentence.java: -------------------------------------------------------------------------------- 1 | package com.ext.conf; 2 | 3 | public class Sentence { 4 | 5 | //句子内容 6 | private String content = null; 7 | //句子权重 8 | private double weight = 0.0; 9 | 10 | public void setContent(String content){ 11 | this.content = content; 12 | } 13 | 14 | public String getContent(){ 15 | return content; 16 | } 17 | 18 | public void setWeight(double weight){ 19 | this.weight = weight; 20 | } 21 | 22 | public double getWeight(){ 23 | return weight; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /com/ext/conf/StopWords.java: -------------------------------------------------------------------------------- 1 | package com.ext.conf; 2 | 3 | import java.util.ArrayList; 4 | 5 | import com.ext.dict.WordDict; 6 | 7 | public class StopWords { 8 | 9 | public static ArrayList cutStopWords(String[] word){ 10 | 11 | ArrayList wordlist = new ArrayList(); 12 | 13 | if(word != null){ 14 | 15 | for(int i = 0;i < word.length;i++){ 16 | 17 | if(WordDict.stopDict.contains(word[i])){ 18 | 19 | continue; 20 | 21 | } 22 | 23 | wordlist.add(word[i]); 24 | } 25 | return wordlist; 26 | 27 | } 28 | 29 | return null; 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /com/ext/db/MysqlDemo.java: -------------------------------------------------------------------------------- 1 | package com.ext.db; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.sql.DriverManager; 6 | import java.sql.ResultSet; 7 | import java.sql.SQLException; 8 | import java.sql.Connection; 9 | import java.sql.Statement; 10 | import java.util.ArrayList; 11 | import java.util.HashSet; 12 | 13 | import com.ext.dict.WordDict; 14 | import com.ext.entrance.Main; 15 | import com.ext.predeal.CutTextIntoSentence; 16 | 17 | public class MysqlDemo { 18 | public static void main(String[] args) throws Exception { 19 | Connection conn = null; 20 | String sql; 21 | // MySQL的JDBC URL编写方式:jdbc:mysql://主机名称:连接端口/数据库的名称?参数=值 22 | // 避免中文乱码要指定useUnicode和characterEncoding 23 | // 执行数据库操作之前要在数据库管理系统上创建一个数据库,名字自己定, 24 | // 下面语句之前就要先创建javademo数据库 25 | String url = "jdbc:mysql://172.25.81.201:5029/distribute_crawler?" 
26 | + "user=sa&password=rapidminer_2g&useUnicode=true&characterEncoding=UTF8&allowMultiQueries=true"; 27 | 28 | try { 29 | BufferedWriter writer = new BufferedWriter(new FileWriter(args[0])); 30 | BufferedWriter writer2 = new BufferedWriter(new FileWriter(args[1])); 31 | // 之所以要使用下面这条语句,是因为要使用MySQL的驱动,所以我们要把它驱动起来, 32 | // 可以通过Class.forName把它加载进去,也可以通过初始化来驱动起来,下面三种形式都可以 33 | Class.forName("com.mysql.jdbc.Driver");// 动态加载mysql驱动 34 | // or: 35 | // com.mysql.jdbc.Driver driver = new com.mysql.jdbc.Driver(); 36 | // or: 37 | // new com.mysql.jdbc.Driver(); 38 | 39 | System.out.println("成功加载MySQL驱动程序"); 40 | // 一个Connection代表一个数据库连接 41 | conn = DriverManager.getConnection(url); 42 | // Statement里面带有很多方法,比如executeUpdate可以实现插入,更新和删除等 43 | Statement stmt = conn.createStatement(); 44 | //sql = "select * from toutiao_article_tb where docId = '" + "6298814404519526658" + "'"; 45 | sql = "select * from toutiao_article_tb"; 46 | ResultSet result = stmt.executeQuery(sql);// executeQuery会返回结果的集合,否则返回空值 47 | int count = 0; 48 | ArrayList sentenceResult = null; 49 | ArrayList sentenceCutResult = null; 50 | while (result!=null && result.next()) { 51 | 52 | int Id = result.getInt(1); 53 | String content = result.getString(2); 54 | String docId = result.getString(3); 55 | String keyWords = result.getString(4); 56 | int reviewCount = result.getInt(5); 57 | String source = result.getString(6); 58 | String tags = result.getString(7); 59 | String time = result.getString(8); 60 | String title = result.getString(9); 61 | String Url = result.getString(10); 62 | 63 | //获取关键词 64 | ArrayList lableList = new ArrayList(); 65 | if(keyWords != null || keyWords.length() > 0 || keyWords != ""){ 66 | String[] keyword = keyWords.split(","); 67 | for(int i = 0;i < keyword.length;i++){ 68 | lableList.add(keyword[i]); 69 | } 70 | } 71 | 72 | //筛选关于手机的新闻 73 | String[] tagArray = tags.split(";"); 74 | for(int i = 0;i < tagArray.length;i++){ 75 | if(WordDict.tagDict.contains(tagArray[i])){ 76 | count++; 77 | //输出句子和对应的权重 78 | sentenceResult = Main.getTopicSentence(content, title, lableList); 79 | for(String output : sentenceResult){ 80 | writer.write(docId + "\t" + output); 81 | writer.newLine(); 82 | } 83 | //输出文章分句后的结果 84 | writer2.write(docId + "\t" + 0 + "\t" + "Title" + ":" + title); 85 | writer2.newLine(); 86 | sentenceCutResult = CutTextIntoSentence.cutTextIntoSentences(content,1); 87 | if(sentenceCutResult != null && sentenceCutResult.size() > 0){ 88 | for(String sentence : sentenceCutResult){ 89 | String[] senArray = sentence.split(":"); 90 | if(senArray.length >=2){ 91 | String sentenceContent = senArray[0]; 92 | String idx = senArray[1]; 93 | writer2.write(docId + "\t" + idx + "\t" + sentenceContent); 94 | writer2.newLine(); 95 | } 96 | } 97 | } 98 | 99 | break; 100 | } 101 | } 102 | } 103 | writer.close(); 104 | writer2.close(); 105 | //System.out.println(count); 106 | // if (result != -1) { 107 | // System.out.println("创建数据表成功"); 108 | // sql = "insert into student(NO,name) values('2012001','陶伟基')"; 109 | // result = stmt.executeUpdate(sql); 110 | // sql = "insert into student(NO,name) values('2012002','周小俊')"; 111 | // result = stmt.executeUpdate(sql); 112 | // sql = "select * from student"; 113 | // ResultSet rs = stmt.executeQuery(sql);// executeQuery会返回结果的集合,否则返回空值 114 | // System.out.println("学号\t姓名"); 115 | // while (rs.next()) { 116 | // System.out 117 | // .println(rs.getString(1) + "\t" + rs.getString(2));// 入如果返回的是int类型可以用getInt() 118 | // } 119 | // } 120 | } catch (SQLException e) { 121 | 
System.out.println("MySQL操作错误"); 122 | e.printStackTrace(); 123 | } catch (Exception e) { 124 | e.printStackTrace(); 125 | } finally { 126 | conn.close(); 127 | } 128 | 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /com/ext/dict/WordDict.java: -------------------------------------------------------------------------------- 1 | package com.ext.dict; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.InputStreamReader; 7 | import java.util.ArrayList; 8 | import java.util.HashSet; 9 | 10 | import com.ext.conf.Config; 11 | 12 | 13 | /** 14 | * 获取外部资源,如特征词库等 15 | * 16 | * @author srcb04161 17 | * 18 | */ 19 | public class WordDict { 20 | 21 | // static String featureDictPath = Config.DICT_PATH + "aspect_feature.jd"; 22 | // 23 | // static String negDictPath = Config.DICT_PATH + "aspect_neg.jd"; 24 | // 25 | // static String posDictPath = Config.DICT_PATH + "aspect_pos.jd"; 26 | 27 | static String stopDictPath = Config.DICT_PATH + "stop.words"; 28 | 29 | static String tagDictPath = Config.DICT_PATH + "tags.txt"; 30 | 31 | // static String negationDictPath = Config.DICT_PATH + "aspect_negation.bk"; 32 | 33 | // public static HashSet featureDict = readTxtFile(featureDictPath); 34 | // 35 | // public static HashSet negDict = readTxtFile(negDictPath); 36 | // 37 | // public static HashSet posDict = readTxtFile(posDictPath); 38 | 39 | public static HashSet stopDict = readTxtFile(stopDictPath); 40 | 41 | public static HashSet tagDict = readTxtFile(tagDictPath); 42 | 43 | // public static HashSet negationDict = readTxtFile(negationDictPath); 44 | 45 | /* 46 | * 读取文件内容 47 | */ 48 | public static HashSet readTxtFile(String filePath) { 49 | 50 | HashSet DictSet = new HashSet(); 51 | 52 | try { 53 | 54 | String encoding = "utf8"; 55 | 56 | File file = new File(filePath); 57 | 58 | if (file.isFile() && file.exists()) { // 判断文件是否存在 59 | 60 | InputStreamReader read = new InputStreamReader( 61 | new FileInputStream(file), encoding);// 设定文件格式 62 | 63 | BufferedReader bufferedReader = new BufferedReader(read); 64 | 65 | String lineTxt = null; 66 | 67 | while ((lineTxt = bufferedReader.readLine()) != null) { 68 | 69 | DictSet.add(lineTxt.trim()); 70 | 71 | } 72 | 73 | read.close(); 74 | 75 | } else { 76 | 77 | System.out.println("文件不存在!"); 78 | 79 | } 80 | } catch (Exception e) { 81 | 82 | System.out.println("读取文件内容出错!"); 83 | 84 | e.printStackTrace(); 85 | 86 | } 87 | return DictSet; 88 | 89 | } 90 | 91 | } 92 | -------------------------------------------------------------------------------- /com/ext/entrance/Main.java: -------------------------------------------------------------------------------- 1 | package com.ext.entrance; 2 | 3 | import java.io.FileNotFoundException; 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.Arrays; 7 | import java.util.HashMap; 8 | import java.util.Iterator; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Map.Entry; 13 | 14 | import com.ext.conf.Sentence; 15 | import com.ext.feature.GetSimilarityRatio; 16 | import com.ext.feature.SenAndTitleSimilarityRatio; 17 | import com.ext.feature.SentenceAndTopicWord; 18 | import com.ext.feature.SentencePosition; 19 | import com.ext.feature.TitleClassification; 20 | import com.ext.tfidf.WordWeight; 21 | 22 | public class Main { 23 | 24 | public static ArrayList getTopicSentence(String content, String title, ArrayList 
lableList) throws Exception { 25 | // TODO Auto-generated method stub 26 | 27 | //String title = "阿里荣威联合发布首款互联网汽车 下月开卖"; 28 | //String title = "谭维克将赴市社科院担任领导职务"; 29 | // String reviewContent = null; 30 | // String filepath = "D:\\ExtTopic\\testfile\\test4.txt"; 31 | // try { 32 | // reviewContent = WordWeight.readFile(filepath); 33 | // } catch (FileNotFoundException e) { 34 | // // TODO Auto-generated catch block 35 | // e.printStackTrace(); 36 | // } catch (IOException e) { 37 | // // TODO Auto-generated catch block 38 | // e.printStackTrace(); 39 | // } 40 | //System.out.println(s); 41 | // ArrayList lableList = new ArrayList(); 42 | // 43 | // lableList.add("阿里"); 44 | // lableList.add("无人驾驶"); 45 | // lableList.add("互联网"); 46 | //lableList.add("Skin"); 47 | 48 | //判断标题是否有提示性,如果标题具有提示性可将标题作为主题句 49 | int titleCategory = TitleClassification.titleClassification(title, content, lableList); 50 | 51 | ArrayList> featureResult = SentencePosition.sentencePosition(lableList, content, title); 52 | 53 | Map senWeightMap = new HashMap(); 54 | 55 | int length = featureResult.size(); 56 | 57 | String[] sentence = new String[length]; 58 | 59 | double[] fvalue = new double[length]; 60 | 61 | int idx = 0; 62 | 63 | //保存句子内容和句子对应权重 64 | List result = new ArrayList(); 65 | 66 | Sentence item; 67 | 68 | for(ArrayList sentenceList : featureResult){ 69 | 70 | double f1 = 0.05 * Double.valueOf(sentenceList.get(1)); 71 | 72 | double f2 = 0.05 * Double.valueOf(sentenceList.get(2)); 73 | 74 | double f3 = 0.05 * Double.valueOf(sentenceList.get(3)); 75 | 76 | double f4 = titleCategory * 0.2 * Double.valueOf(sentenceList.get(4)); 77 | 78 | double f5 = titleCategory * 0.2 * Double.valueOf(sentenceList.get(5)); 79 | 80 | double f6 = 0.15 * Double.valueOf(sentenceList.get(6)); 81 | 82 | double f7 = 0.3 * Double.valueOf(sentenceList.get(7)); 83 | 84 | double sentenceWeight = f1 + f2 + f3 + f4 + f5 + f6 + f7; 85 | 86 | sentence[idx] = sentenceList.get(0); 87 | 88 | fvalue[idx] = sentenceWeight; 89 | 90 | idx++; 91 | 92 | //System.out.println(sentenceList.get(0) + ":" + sentenceWeight); 93 | 94 | senWeightMap.put(sentenceList.get(0), sentenceWeight); 95 | 96 | item = new Sentence(); 97 | 98 | item.setContent(sentenceList.get(0)); 99 | 100 | item.setWeight(sentenceWeight); 101 | 102 | result.add(item); 103 | 104 | } 105 | 106 | //计算两个句子的相似度,若两个句子的相似度大于0.6则输出权重大的句子,若权重相同则两个句子都输出 107 | // ArrayList sentenceArray = new ArrayList(); 108 | // ArrayList ffvalue = new ArrayList(); 109 | // double[][] ratio = new double[sentence.length][sentence.length]; 110 | // 111 | // for(int m = 0;m < sentence.length;m++){ 112 | // for(int n = m;n < sentence.length;n++){ 113 | // 114 | // double similarityRatio = GetSimilarityRatio.getSimilarityRatio(sentence[m], sentence[n]); 115 | // ratio[m][n] = similarityRatio; 116 | // 117 | // if(similarityRatio > 0.6){ 118 | // if(fvalue[m] < fvalue[n]){ 119 | // sentence[m] = sentence[n]; 120 | // fvalue[m] = fvalue[n]; 121 | // } 122 | // } 123 | // sentenceArray.add(sentence[m]); 124 | // ffvalue.add(fvalue[m]); 125 | // 126 | // } 127 | // } 128 | 129 | 130 | //使用冒泡排序实现句子与权重同时排序 131 | for(int i = 0;i < sentence.length;i++){ 132 | for(int j = i;j < sentence.length;j++){ 133 | if(fvalue[i] < fvalue[j]){ 134 | double value = fvalue[j]; 135 | fvalue[j] = fvalue[i]; 136 | fvalue[i] = value; 137 | 138 | String sentenceStr = sentence[j]; 139 | sentence[j] = sentence[i]; 140 | sentence[i] = sentenceStr; 141 | 142 | } 143 | } 144 | } 145 | 146 | //保存最终结果 147 | ArrayList sentenceResult = new ArrayList(); 
148 | 149 | //如果标题具有提示性则将标题作为主题句 150 | if(titleCategory == 1){ 151 | sentenceResult.add(title + "\t" + 10.0); 152 | //System.out.println(title + ":" + 10.0); 153 | } 154 | //文章总句数若少于10句则全部为主题句,若多于10句则取1/3为主题句 155 | // int sentenceNum = sentence.length; 156 | // 157 | // if(sentenceNum > 10){ 158 | // 159 | // sentenceNum = sentenceNum/3; 160 | // 161 | // } 162 | 163 | for(int k = 0;k < sentence.length;k++){ 164 | sentenceResult.add(sentence[k] + "\t" + fvalue[k]); 165 | //System.out.println(sentence[k] + ":" + fvalue[k]); 166 | } 167 | return sentenceResult; 168 | 169 | // List keyList = new LinkedList(); 170 | // keyList.addAll(senWeightMap.keySet()); 171 | // List valueList = new LinkedList(); 172 | // valueList.addAll(senWeightMap.values()); 173 | // for(int i = 0;i < valueList.size();i++){ 174 | // for(int j = i+1;j < valueList.size();j++){ 175 | // if(valueList.get(j) > valueList.get(i)){ 176 | // valueList.set(j, valueList.get(i)); 177 | // valueList.set(i, valueList.get(j)); 178 | // 179 | // keyList.set(j, keyList.get(i)); 180 | // keyList.set(i, keyList.get(j)); 181 | // } 182 | // } 183 | // } 184 | // 185 | // Map sortWeight = new HashMap(); 186 | // 187 | // for(int k = 0;k < keyList.size();k++){ 188 | // System.out.println(keyList.get(k)+ ":" +valueList.get(k)); 189 | // sortWeight.put(keyList.get(k), valueList.get(k)); 190 | // } 191 | // 192 | // Iterator iter = sortWeight.entrySet().iterator(); 193 | // while(iter.hasNext()){ 194 | // 195 | // Map.Entry entry = (Entry) iter.next(); 196 | // String key = entry.getKey(); 197 | // Double value = entry.getValue(); 198 | // //System.out.println(key + ":" + value); 199 | // 200 | // } 201 | 202 | } 203 | 204 | private static boolean isNaN(String result) { 205 | // TODO Auto-generated method stub 206 | if(result.equals("NaN") || result.equals("Infinity")){ 207 | return true; 208 | } 209 | 210 | return false; 211 | } 212 | 213 | } 214 | -------------------------------------------------------------------------------- /com/ext/feature/GetSimilarityRatio.java: -------------------------------------------------------------------------------- 1 | package com.ext.feature; 2 | 3 | public class GetSimilarityRatio { 4 | 5 | private static int compare(String str, String target) { 6 | int d[][]; // 矩阵 7 | int n = str.length(); 8 | int m = target.length(); 9 | int i; // 遍历str的 10 | int j; // 遍历target的 11 | char ch1; // str的 12 | char ch2; // target的 13 | int temp; // 记录相同字符,在某个矩阵位置值的增量,不是0就是1 14 | 15 | if (n == 0) { 16 | return m; 17 | } 18 | 19 | if (m == 0) { 20 | return n; 21 | } 22 | d = new int[n + 1][m + 1]; 23 | for (i = 0; i <= n; i++) { // 初始化第一列 24 | d[i][0] = i; 25 | } 26 | 27 | for (j = 0; j <= m; j++) { // 初始化第一行 28 | d[0][j] = j; 29 | } 30 | 31 | for (i = 1; i <= n; i++) { // 遍历str 32 | ch1 = str.charAt(i - 1); 33 | // 去匹配target 34 | for (j = 1; j <= m; j++) { 35 | ch2 = target.charAt(j - 1); 36 | if (ch1 == ch2) { 37 | temp = 0; 38 | } else { 39 | temp = 1; 40 | } 41 | 42 | // 左边+1,上边+1, 左上角+temp取最小 43 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] 44 | + temp); 45 | } 46 | } 47 | 48 | return d[n][m]; 49 | } 50 | 51 | private static int min(int one, int two, int three) { 52 | return (one = one < two ? one : two) < three ? 
one : three; 53 | } 54 | 55 | /** 56 | * 获取两字符串的相似度 57 | * 58 | * @param str 59 | * @param target 60 | * 61 | * @return 62 | */ 63 | public static double getSimilarityRatio(String str, String target) { 64 | return 1 - (double) compare(str, target) 65 | / Math.max(str.length(), target.length()); 66 | 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /com/ext/feature/NerFrequency.java: -------------------------------------------------------------------------------- 1 | package com.ext.feature; 2 | 3 | import java.util.ArrayList; 4 | 5 | import com.ext.ner.ExtractNer; 6 | import com.ext.seg.word.SegWord; 7 | /** 8 | * 3 — 命名实体特征 9 | * @author srcb04161 10 | * 11 | */ 12 | public class NerFrequency { 13 | 14 | public static ArrayList> getNerFrequency(String reviewContent){ 15 | 16 | ArrayList> featureResult = SentenceLengh.getSentenceLength(reviewContent); 17 | 18 | for(ArrayList sentenceList : featureResult){ 19 | 20 | int length = sentenceList.get(0).length(); 21 | 22 | if(length == 0){ 23 | 24 | length = 1; 25 | 26 | } 27 | 28 | ArrayList nerResult = ExtractNer.getNerResultByJieba(sentenceList.get(0)); 29 | 30 | String[] word = SegWord.segmentByJieba(sentenceList.get(0)); 31 | 32 | int count = 0; 33 | 34 | for(int i = 0;i < word.length;i++){ 35 | 36 | if(word[i].length() < nerResult.get(i).length()){ 37 | 38 | count++; 39 | 40 | } 41 | 42 | } 43 | 44 | double NerScore = (double)count/length; 45 | 46 | sentenceList.add(String.valueOf(NerScore)); 47 | 48 | } 49 | return featureResult; 50 | 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /com/ext/feature/SenAndTitleSimilarityRatio.java: -------------------------------------------------------------------------------- 1 | package com.ext.feature; 2 | 3 | import java.util.ArrayList; 4 | /** 5 | * 计算句子与标题的相似度特征 6 | * @author srcb04161 7 | * 8 | */ 9 | public class SenAndTitleSimilarityRatio { 10 | 11 | public static ArrayList> getSenAndTitleSimilarityRatio(String reviewContent, String title){ 12 | 13 | ArrayList> featureResult = SentenceAndTitle.getSimilarityWithSenAndTitle(reviewContent, title); 14 | 15 | for(ArrayList sentenceList : featureResult){ 16 | 17 | double similarityRatio = GetSimilarityRatio.getSimilarityRatio(sentenceList.get(0), title); 18 | 19 | sentenceList.add(String.valueOf(similarityRatio)); 20 | 21 | } 22 | return featureResult; 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /com/ext/feature/SentenceAndTitle.java: -------------------------------------------------------------------------------- 1 | package com.ext.feature; 2 | 3 | import java.util.ArrayList; 4 | 5 | import com.ext.conf.StopWords; 6 | import com.ext.seg.word.SegWord; 7 | import com.ext.word.weight.GetAllEntropy; 8 | import com.ext.word.weight.GetWordEntropy; 9 | /** 10 | * 4 - 句子与标题的重合度特征 11 | * @author srcb04161 12 | * 13 | */ 14 | public class SentenceAndTitle { 15 | 16 | public static ArrayList> getSimilarityWithSenAndTitle(String reviewContent, String title){ 17 | 18 | String[] word = SegWord.segmentByJieba(title); 19 | 20 | ArrayList wordClean = StopWords.cutStopWords(word); 21 | 22 | double titleEntropy = GetWordEntropy.getAllEntropy(word); 23 | 24 | if(titleEntropy == 0.0){ 25 | 26 | titleEntropy = 1.0; 27 | 28 | } 29 | 30 | ArrayList> featureResult = NerFrequency.getNerFrequency(reviewContent); 31 | 32 | for(ArrayList sentenceList : featureResult){ 33 | 34 | String[] wordStr = 
SegWord.segmentByJieba(sentenceList.get(0)); 35 | 36 | ArrayList wordStrClean = StopWords.cutStopWords(wordStr); 37 | 38 | ArrayList strResult = new ArrayList(); 39 | //找出句子与标题相同的词 40 | for(String str1 : wordStrClean){ 41 | 42 | for(String str2 : wordClean){ 43 | 44 | if(str1.equals(str2)){ 45 | 46 | strResult.add(str1); 47 | 48 | } 49 | 50 | } 51 | 52 | } 53 | 54 | String[] allWordEntropy = GetAllEntropy.getAllEntropy(reviewContent); 55 | 56 | double frequencySen = 0.0; 57 | //获取相同词的权重和 58 | for(int j = 0;j < strResult.size();j++){ 59 | 60 | for(int m = 0;m < allWordEntropy.length;m++){ 61 | 62 | String[] worden = allWordEntropy[m].split(":"); 63 | 64 | if(strResult.get(j).equals(worden[0])){ 65 | 66 | frequencySen = frequencySen + Double.valueOf(worden[1]); 67 | 68 | } 69 | 70 | } 71 | 72 | } 73 | 74 | double similaryWithSenAndTitle = (double)frequencySen/titleEntropy; 75 | 76 | sentenceList.add(String.valueOf(similaryWithSenAndTitle)); 77 | 78 | } 79 | return featureResult; 80 | 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /com/ext/feature/SentenceAndTopicWord.java: -------------------------------------------------------------------------------- 1 | package com.ext.feature; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashSet; 5 | 6 | import com.ext.conf.StopWords; 7 | import com.ext.seg.word.SegWord; 8 | import com.ext.topic.word.ExtractTopicWord; 9 | /** 10 | * 句子与关键词集重合度 11 | * @author srcb04161 12 | * 13 | */ 14 | public class SentenceAndTopicWord { 15 | 16 | public static ArrayList> sentenceAndTopicWord(ArrayList lableList, String reviewContent, String title) throws Exception{ 17 | 18 | HashSet keywordSet = ExtractTopicWord.getKeyWords(reviewContent, lableList); 19 | 20 | int size = keywordSet.size(); 21 | 22 | if(size == 0){ 23 | 24 | size = 1; 25 | 26 | } 27 | 28 | ArrayList> featureResult = SenAndTitleSimilarityRatio.getSenAndTitleSimilarityRatio(reviewContent, title); 29 | 30 | for(ArrayList sentenceList : featureResult){ 31 | 32 | String[] word = SegWord.segmentByJieba(sentenceList.get(0)); 33 | 34 | ArrayList wordlist = StopWords.cutStopWords(word); 35 | 36 | String[] wordclean = new String[wordlist.size()]; 37 | 38 | for(int i = 0;i < wordlist.size();i++){ 39 | 40 | wordclean[i] = wordlist.get(i); 41 | 42 | } 43 | 44 | int count = 0; 45 | 46 | for(int j = 0;j < wordclean.length;j++){ 47 | 48 | for(String keyword : keywordSet){ 49 | 50 | if(wordclean[j].equals(keyword)){ 51 | 52 | count++; 53 | 54 | } 55 | 56 | } 57 | 58 | } 59 | 60 | double sentenceWord = (double)count/size; 61 | 62 | sentenceList.add(String.valueOf(sentenceWord)); 63 | 64 | } 65 | return featureResult; 66 | 67 | } 68 | 69 | } 70 | -------------------------------------------------------------------------------- /com/ext/feature/SentenceLengh.java: -------------------------------------------------------------------------------- 1 | package com.ext.feature; 2 | 3 | import java.util.ArrayList; 4 | /** 5 | * 2 - 句子长度特征 6 | * @author srcb04161 7 | * 8 | */ 9 | public class SentenceLengh { 10 | 11 | public static ArrayList> getSentenceLength(String reviewContent){ 12 | 13 | ArrayList> frequencyResult = WordFrequencyToSen.getWordFrequencyToSen(reviewContent); 14 | 15 | for(ArrayList senList : frequencyResult){ 16 | 17 | int senLength = senList.get(0).length(); 18 | 19 | if(senLength > 16){ 20 | 21 | senList.add("1.0"); 22 | 23 | }else{ 24 | 25 | senList.add("0.0"); 26 | } 27 | 28 | } 29 | return frequencyResult; 30 | 31 | } 32 | 33 | } 34 | 
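SentencePosition.java, which follows, implements the position formula from README item (7). Here is a self-contained sketch of just that scoring rule, with the 1-based sentence index i, the sentence count n, and the paragraph-initial flag passed in directly; the class and method names are made up for illustration.

public class PositionScoreSketch {

    /**
     * Position score of the i-th sentence (1-based) in an article of n
     * sentences: 1.0 if i < l (here l = 3, as in the README) or the sentence
     * opens its paragraph, otherwise 1 - log(i)/log(n).
     */
    public static double positionScore(int i, int n, boolean paragraphInitial) {
        if (i < 3 || paragraphInitial) {
            return 1.0;
        }
        return 1.0 - Math.log(i) / Math.log(n);
    }

    public static void main(String[] args) {
        // In a 20-sentence article, sentence 2 scores 1.0 and sentence 10 about 0.23.
        System.out.println(positionScore(2, 20, false));  // 1.0
        System.out.println(positionScore(10, 20, false)); // ~0.2314
    }
}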
-------------------------------------------------------------------------------- /com/ext/feature/SentencePosition.java: --------------------------------------------------------------------------------
1 | package com.ext.feature;
2 | 
3 | import java.util.ArrayList;
4 | 
5 | import com.ext.predeal.CutTextIntoSentence;
6 | /**
7 |  * 7 - sentence-position feature
8 |  */
9 | public class SentencePosition {
10 | 
11 |     public static ArrayList<ArrayList<String>> sentencePosition(ArrayList<String> lableList, String reviewContent, String title) throws Exception {
12 | 
13 |         ArrayList<ArrayList<String>> featureResult = SentenceAndTopicWord.sentenceAndTopicWord(lableList, reviewContent, title);
14 | 
15 |         ArrayList<String> sentenceIdx = CutTextIntoSentence.cutTextIntoSentences(reviewContent, 2);
16 | 
17 |         for (ArrayList<String> sentenceList : featureResult) {
18 | 
19 |             double score = 0.0;
20 | 
21 |             for (String sentenceidx : sentenceIdx) {
22 | 
23 |                 String[] sentence = sentenceidx.split(":");
24 |                 String senContent = sentence[0];
25 |                 // paragraph number
26 |                 String paragraphIdx = sentence[1];
27 |                 // sentence number within the article
28 |                 String idx = sentence[2];
29 |                 // sentence number within its paragraph
30 |                 String idx2 = sentence[3];
31 | 
32 |                 if (sentenceList.get(0).equals(senContent)) {
33 |                     // For the i-th sentence of the article: 1.0 if i < 3 or the
34 |                     // sentence opens its paragraph, otherwise 1 - log(i)/log(N).
35 |                     if (Integer.parseInt(idx) < 3 || Integer.parseInt(idx2) == 1) {
36 |                         score = 1.0;
37 |                     } else {
38 |                         int sentenceNum = sentenceIdx.size();
39 |                         score = 1 - (double) Math.log(Integer.parseInt(idx)) / Math.log(sentenceNum);
40 |                     }
41 |                     break;
42 |                 }
43 |             }
44 | 
45 |             // Append the position score exactly once per sentence. (The original
46 |             // code appended inside the inner loop, once per candidate sentence,
47 |             // which misaligned the feature columns read back in Main.)
48 |             sentenceList.add(String.valueOf(score));
49 |         }
50 |         return featureResult;
51 |     }
52 | }
-------------------------------------------------------------------------------- /com/ext/feature/TitleClassification.java: --------------------------------------------------------------------------------
1 | package com.ext.feature;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.HashSet;
5 | 
6 | import com.ext.conf.StopWords;
7 | import com.ext.seg.word.SegWord;
8 | import com.ext.topic.word.ExtractTopicWord;
9 | 
10 | /**
11 |  * Title classification: decides whether a title is suggestive of the topic.
12 |  */
13 | public class TitleClassification {
14 | 
15 |     // content is the full article text (the former parameter name, filepath, was misleading)
16 |     public static int titleClassification(String title, String content, ArrayList<String> lableList) {
17 | 
18 |         int category = 0;
19 | 
20 |         String[] titleword = SegWord.segmentByJieba(title);
21 | 
22 |         ArrayList<String> titleList = StopWords.cutStopWords(titleword);
23 | 
24 |         try {
25 |             HashSet<String> keywords = ExtractTopicWord.getKeyWords(content, lableList);
26 | 
27 |             int count = 0;
28 |             // Overlap between title words and the keyword set: the title is
29 |             // suggestive when they share at least one word.
30 |             for (int i = 0; i < titleList.size(); i++) {
31 |                 if (keywords.contains(titleList.get(i))) {
32 |                     count++;
33 |                 }
34 |             }
35 | 
36 |             if (count >= 1) {
37 |                 category = 1;
38 |             }
39 |         } catch (Exception e) {
40 |             e.printStackTrace();
41 |         }
42 | 
43 |         return category;
44 |     }
45 | }
-------------------------------------------------------------------------------- /com/ext/feature/WordFrequencyToSen.java: --------------------------------------------------------------------------------
1 | package com.ext.feature;
2 | 
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | 
6 | import com.ext.conf.StopWords;
7 | import com.ext.dict.WordDict;
8 | import com.ext.predeal.CutTextIntoSentence;
9 | import com.ext.seg.word.SegWord;
10 | import com.ext.word.weight.GetAllEntropy;
11 | import com.ext.word.weight.GetWordEntropy;
12 | import com.ext.conf.Sentence;
13 | 
14 | /**
15 |  * 1 - relative word-frequency feature of a sentence
16 |  * 
@author srcb04161 17 | * 18 | */ 19 | public class WordFrequencyToSen { 20 | 21 | public static ArrayList> getWordFrequencyToSen(String reviewContent){ 22 | 23 | ArrayList sentenceList = CutTextIntoSentence.cutTextIntoSentences(reviewContent,0); 24 | 25 | //获取该篇文档每个词的权重 26 | String[] WordEntropy = GetAllEntropy.getAllEntropy(reviewContent); 27 | 28 | ArrayList frequencyList = new ArrayList(); 29 | 30 | ArrayList> frequencyResult = new ArrayList>(); 31 | //保存句子内容 32 | List result = new ArrayList(); 33 | Sentence item; 34 | //获取并保存每个句子 35 | for(int i = 0;i < sentenceList.size();i++){ 36 | 37 | ArrayList senList = new ArrayList(); 38 | 39 | senList.add(sentenceList.get(i)); 40 | 41 | frequencyResult.add(senList); 42 | 43 | item = new Sentence(); 44 | 45 | item.setContent(sentenceList.get(i)); 46 | 47 | result.add(item); 48 | 49 | } 50 | 51 | double frequency = 0.1; 52 | 53 | double[] senFrequency = new double[sentenceList.size()]; 54 | //针对每个句子计算权重 55 | for(int k = 0; k < sentenceList.size();k++){ 56 | 57 | String[] word = SegWord.segmentByJieba(sentenceList.get(k)); 58 | 59 | ArrayList wordlist = StopWords.cutStopWords(word); 60 | 61 | String[] wordclean = new String[wordlist.size()]; 62 | 63 | double frequencySen = 0.0; 64 | //获取句子的权重和 65 | for(int j = 0;j < wordlist.size();j++){ 66 | 67 | for(int m = 0;m < WordEntropy.length;m++){ 68 | 69 | String[] worden = WordEntropy[m].split(":"); 70 | 71 | if(wordlist.get(j).equals(worden[0])){ 72 | 73 | frequencySen = frequencySen + Double.valueOf(worden[1]); 74 | 75 | } 76 | 77 | } 78 | 79 | } 80 | 81 | senFrequency[k] = frequencySen; 82 | //比较得到所有句子中最大的权重 83 | if(frequency <= frequencySen){ 84 | 85 | frequency = frequencySen; 86 | 87 | } 88 | 89 | } 90 | //计算每个句子的相对词频,并保存 91 | for(int i = 0;i < frequencyResult.size();i++){ 92 | 93 | double senRelativeFre = (double)senFrequency[i]/frequency; 94 | 95 | frequencyResult.get(i).add(String.valueOf(senRelativeFre)); 96 | 97 | 98 | } 99 | return frequencyResult; 100 | 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /com/ext/ner/ExtractNer.java: -------------------------------------------------------------------------------- 1 | package com.ext.ner; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | 6 | import org.wltea.analyzer.lucene.IKAnalyzer; 7 | 8 | import com.ext.seg.word.SegWord; 9 | 10 | import edu.stanford.nlp.ie.AbstractSequenceClassifier; 11 | import edu.stanford.nlp.ie.crf.CRFClassifier; 12 | import edu.stanford.nlp.ling.CoreLabel; 13 | 14 | /* 15 | 16 | 加载NER模块 17 | 18 | */ 19 | 20 | public class ExtractNer 21 | 22 | { 23 | 24 | private static AbstractSequenceClassifier ner; 25 | 26 | public ExtractNer() 27 | 28 | { 29 | 30 | InitNer(); 31 | 32 | } 33 | 34 | public void InitNer() 35 | 36 | { 37 | 38 | String serializedClassifier = "ner/chinese.misc.distsim.crf.ser.gz";// chinese.misc.distsim.crf.ser.gz 39 | 40 | if (ner == null) 41 | 42 | { 43 | 44 | ner = CRFClassifier.getClassifierNoExceptions(serializedClassifier); 45 | 46 | } 47 | 48 | } 49 | 50 | public String doNer(String sent) 51 | 52 | { 53 | 54 | return ner.classifyWithInlineXML(sent); 55 | 56 | } 57 | 58 | public ArrayList cutWord(String sentence) throws Exception { 59 | 60 | ArrayList words = new ArrayList(); 61 | 62 | IKAnalyzer analyzer = new IKAnalyzer(true); 63 | 64 | words = analyzer.split(sentence); 65 | 66 | return words; 67 | } 68 | 69 | public static ArrayList getNerResultByJieba(String sentence) { 70 | 71 | ArrayList nerResult = new 
ArrayList();
72 | 
73 | String[] word = SegWord.segmentByJieba(sentence);
74 | 
75 | ExtractNer extractNer = new ExtractNer();
76 | 
77 | for (int i = 0; i < word.length; i++) {
78 | 
79 | nerResult.add(extractNer.doNer(word[i]));
80 | 
81 | }
82 | 
83 | return nerResult;
84 | }
85 | 
86 | public ArrayList getNerResultByIk(String sentence) throws IOException {
87 | 
88 | ArrayList nerResult = new ArrayList();
89 | 
90 | ArrayList words = new ArrayList();
91 | 
92 | IKAnalyzer analyzer = new IKAnalyzer(true);
93 | 
94 | words = analyzer.split(sentence);
95 | 
96 | ExtractNer extractNer = new ExtractNer();
97 | 
98 | for (int i = 0; i < words.size(); i++) {
99 | 
100 | nerResult.add(extractNer.doNer(words.get(i)));
101 | 
102 | }
103 | 
104 | return nerResult;
105 | }
106 | 
107 | public static void main(String args[]) {
108 | 
109 | String str = "去年开始,打开百度李毅吧,满屏的帖子大多含有“屌丝”二字,一般网友不仅不懂这词什么意思,更难理解这个词为什么会这么火。";
110 | 
111 | String[] word = SegWord.segmentByJieba(str);
112 | 
113 | ExtractNer extractNer = new ExtractNer();
114 | 
115 | for (int i = 0; i < word.length; i++) {
116 | 
117 | // print each token with its inline-XML NER tag
118 | System.out.println(extractNer.doNer(word[i]));
119 | 
120 | }
121 | 
122 | System.out.println("Complete!");
123 | 
124 | }
125 | 
126 | }
-------------------------------------------------------------------------------- /com/ext/ner/SegWordNer.java: --------------------------------------------------------------------------------
1 | package com.ext.ner;
2 | 
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.List;
6 | import java.util.Properties;
7 | 
8 | import org.apache.commons.io.FileUtils;
9 | 
10 | import edu.stanford.nlp.ie.crf.CRFClassifier;
11 | import edu.stanford.nlp.ling.CoreLabel;
12 | 
13 | /*
14 |  * Chinese word segmentation and NER with Stanford CoreNLP
15 |  */
16 | public class SegWordNer {
17 | 
18 |     public static CRFClassifier<CoreLabel> segmenter;
19 | 
20 |     static {
21 |         // initialization parameters for the CTB segmenter model
22 |         Properties props = new Properties();
23 |         props.setProperty("sighanCorporaDict", "data");
24 |         props.setProperty("serDictionary", "data/dict-chris6.ser.gz");
25 |         props.setProperty("inputEncoding", "UTF-8");
26 |         props.setProperty("sighanPostProcessing", "true");
27 |         segmenter = new CRFClassifier<CoreLabel>(props);
28 |         segmenter.loadClassifierNoExceptions("data/ctb.gz", props);
29 |         segmenter.flags.setProperties(props);
30 |     }
31 | 
32 |     public static String doSegment(String sent) {
33 |         // segmentString returns a List<String>; the original
34 |         // (String[]) ...toArray() cast would throw ClassCastException at runtime.
35 |         List<String> strs = segmenter.segmentString(sent);
36 |         StringBuffer buf = new StringBuffer();
37 |         for (String s : strs) {
38 |             buf.append(s + " ");
39 |         }
40 |         System.out.println("segmentedres: " + buf.toString());
41 |         return buf.toString();
42 |     }
43 | 
44 |     public static void main(String[] args) {
45 |         try {
46 |             String readFileToString = FileUtils.readFileToString(new File("file/test.txt"));
47 |             String doSegment = doSegment(readFileToString);
48 |             System.out.println(doSegment);
49 | 
50 |             ExtractNer extractNer = new ExtractNer();
51 |             System.out.println(extractNer.doNer(doSegment));
52 |             System.out.println("Complete!");
53 |         } catch (IOException e) {
54 |             e.printStackTrace();
55 |         }
56 |     }
57 | }
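The predeal classes below split text into sentences with java.text.BreakIterator. A minimal standalone demonstration of that pattern, with an invented sample string:

import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Locale;

public class SentenceSplitSketch {

    /** Splits text into trimmed sentences using the Chinese-locale sentence iterator. */
    public static ArrayList<String> split(String text) {
        ArrayList<String> sentences = new ArrayList<String>();
        BreakIterator boundary = BreakIterator.getSentenceInstance(Locale.CHINESE);
        boundary.setText(text);
        int start = boundary.first();
        for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary.next()) {
            sentences.add(text.substring(start, end).trim());
        }
        return sentences;
    }

    public static void main(String[] args) {
        // prints the three sentences on separate lines
        for (String s : split("今天发布了新手机。性能不错!价格也合理。")) {
            System.out.println(s);
        }
    }
}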
-------------------------------------------------------------------------------- /com/ext/predeal/CutTextIntoSentence.java: --------------------------------------------------------------------------------
1 | package com.ext.predeal;
2 | 
3 | import java.io.File;
4 | import java.io.FileNotFoundException;
5 | import java.io.IOException;
6 | import java.text.BreakIterator;
7 | import java.util.ArrayList;
8 | import java.util.HashMap;
9 | import java.util.Locale;
10 | import java.util.StringTokenizer;
11 | 
12 | import com.ext.tfidf.WordWeight;
13 | 
14 | public class CutTextIntoSentence {
15 | 
16 |     public static void main(String[] args) {
17 | 
18 |         String s = "拿到手机用了一下,首先说说优点:原生的android系统跟国内定制过的系统可以说是天壤之别!十分流畅,毫不弱于ios,其次moto的手机手感也不错!\n当然这款手机也有一些缺点,比如使用的习惯跟其他国产手机有些区别。";
19 |         // System.out.println(s);
20 |         String filepath = "D:\\ExtTopic\\file\\test.txt";
21 |         String reviewContent = null;
22 |         try {
23 |             reviewContent = WordWeight.readFile(filepath);
24 |         } catch (FileNotFoundException e) {
25 |             e.printStackTrace();
26 |         } catch (IOException e) {
27 |             e.printStackTrace();
28 |         }
29 |         ArrayList<String> test = cutTextIntoSentences(reviewContent, 0);
30 |         for (String c : test) {
31 |             System.out.println(c);
32 |         }
33 |     }
34 | 
35 |     // tag = 0: split into sentences only; tag = 1: also record each sentence's
36 |     // index in the article; tag = 2: also record paragraph-position information
37 |     public static ArrayList<String> cutTextIntoSentences(String reviewContent, int tag) {
38 |         ArrayList<String> clauses = new ArrayList<String>();
39 |         if (tag == 0) {
40 |             // strip line breaks
41 |             reviewContent = reviewContent.replaceAll("\r\n", "");
42 |             reviewContent = reviewContent.replaceAll("\n", "");
43 | 
44 |             ArrayList<String> SubSentences = new ArrayList<String>();
45 |             BreakIterator boundary = BreakIterator
46 |                     .getSentenceInstance(Locale.CHINESE);
47 |             boundary.setText(reviewContent);
48 |             int start = boundary.first();
49 |             for (int end = boundary.next(); end != BreakIterator.DONE;
50 |                     start = end, end = boundary.next()) {
51 |                 SubSentences.add(reviewContent.substring(start, end));
52 |             }
53 |             for (String sent : SubSentences) {
54 |                 clauses.add(sent.trim());
55 |             }
56 |         } else if (tag == 1) {
57 |             // strip colons (used below as field separators) and line breaks
58 |             reviewContent = reviewContent.replaceAll(":", "");
59 |             reviewContent = reviewContent.replaceAll("\r\n", "");
60 |             reviewContent = reviewContent.replaceAll("\n", "");
61 |             ArrayList<String> SubSentences = new ArrayList<String>();
62 |             BreakIterator boundary = BreakIterator
63 |                     .getSentenceInstance(Locale.CHINESE);
64 |             boundary.setText(reviewContent);
65 |             int start = boundary.first();
66 |             // running sentence index within the article
67 |             int idx = 1;
68 |             for (int end = boundary.next(); end != BreakIterator.DONE;
69 |                     start = end, end = boundary.next()) {
70 |                 SubSentences.add(reviewContent.substring(start, end) + ":" + idx);
71 |                 idx++;
72 |             }
73 |             for (String sent : SubSentences) {
74 |                 clauses.add(sent.trim());
75 |             }
76 |         } else if (tag == 2) {
77 |             // Normalize the text the way CutTextIntoSentences.cutTextIntoSentencesIdx
78 |             // does: strip colons (field separators) and stray tabs, then turn line
79 |             // breaks into tabs so that split("\t") below recovers the paragraphs.
80 |             // Without this the original code usually found no tabs to split on.
81 |             reviewContent = reviewContent.replaceAll(":", "").replaceAll("\t", "");
82 |             reviewContent = reviewContent.replaceAll("\r\n", "\t").replaceAll("\n", "\t");
83 |             String[] text = reviewContent.split("\t");
84 |             ArrayList<String> textList = new ArrayList<String>();
85 |             for (int i = 0; i < text.length; i++) {
86 |                 if (text[i].equals("")) {
87 |                     continue;
88 |                 }
89 |                 textList.add(text[i]);
90 |             }
91 |             // j: running sentence index within the whole article
92 |             int j = 1;
93 |             for (int i = 0; i < textList.size(); i++) {
94 |                 // k: sentence index within the current paragraph
95 |                 int k = 1;
96 |                 ArrayList<String> SubSentencesIdx = new ArrayList<String>();
97 |                 BreakIterator boundary = BreakIterator
98 |                         .getSentenceInstance(Locale.CHINESE);
99 |                 boundary.setText(textList.get(i));
100 |                 int start = boundary.first();
101 |                 for (int end = boundary.next(); end != BreakIterator.DONE;
102 |                         start = end, end = boundary.next()) {
103 |                     SubSentencesIdx.add(textList.get(i).substring(start, end)
104 |                             + 
":" + i + ":" + j + ":" + k); 105 | k++; 106 | j++; 107 | } 108 | for (String sent : SubSentencesIdx) { 109 | clauses.add(sent.trim()); 110 | } 111 | } 112 | } 113 | 114 | return clauses; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /com/ext/predeal/CutTextIntoSentences.java: -------------------------------------------------------------------------------- 1 | package com.ext.predeal; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.text.BreakIterator; 7 | import java.util.ArrayList; 8 | import java.util.HashMap; 9 | import java.util.Locale; 10 | import java.util.StringTokenizer; 11 | 12 | import com.ext.tfidf.WordWeight; 13 | 14 | public class CutTextIntoSentences { 15 | 16 | public static void main(String[] args) { 17 | 18 | String s = "拿到手机用了一下,首先说说优点:原生的android系统跟国内定制过的系统可以说是天壤之别!十分流畅,毫不弱于ios,其次moto的手机手感也不错!\n当然这款手机也有一些缺点,比如使用的习惯跟其他国产手机有些区别。"; 19 | //System.out.println(s); 20 | String filepath = "D:\\ExtTopic\\file\\test.txt"; 21 | String reviewContent = null; 22 | try { 23 | reviewContent = WordWeight.readFile(filepath); 24 | } catch (FileNotFoundException e) { 25 | // TODO Auto-generated catch block 26 | e.printStackTrace(); 27 | } catch (IOException e) { 28 | // TODO Auto-generated catch block 29 | e.printStackTrace(); 30 | } 31 | ArrayList test = cutTextIntoSentences(reviewContent); 32 | for (String c : test) { 33 | System.out.println(c); 34 | } 35 | 36 | } 37 | 38 | public static ArrayList cutTextIntoSentences(String reviewContent) { 39 | // 去掉回车符 40 | reviewContent = reviewContent.replaceAll("\r\n", ""); 41 | reviewContent = reviewContent.replaceAll("\n", ""); 42 | 43 | ArrayList SubSentences = new ArrayList(); 44 | BreakIterator boundary = BreakIterator 45 | .getSentenceInstance(Locale.CHINESE); 46 | boundary.setText(reviewContent); 47 | int start = boundary.first(); 48 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary 49 | .next()) { 50 | SubSentences.add(reviewContent.substring(start, end)); 51 | } 52 | 53 | ArrayList clauses = new ArrayList(); 54 | 55 | // 如果句子太长,进一步分句;将来可考虑全都以逗号分句 56 | for (String sent : SubSentences) { 57 | if (sent.length() >= 300) { 58 | StringTokenizer token_comma = new StringTokenizer(sent, ",,;", 59 | false); 60 | if (token_comma.countTokens() > 1) { 61 | while (token_comma.hasMoreTokens()) { 62 | String clause = token_comma.nextToken().trim(); 63 | if (clause.length() > 0) { 64 | clauses.add(clause); 65 | } 66 | } 67 | } 68 | 69 | } else { 70 | clauses.add(sent.trim()); 71 | } 72 | } 73 | 74 | return clauses; 75 | } 76 | 77 | //获取句子在文章中的位置 78 | public static ArrayList cutTextIntoSentencesGetIdx(String reviewContent) { 79 | // 去掉回车符 80 | reviewContent = reviewContent.replaceAll(":", ""); 81 | reviewContent = reviewContent.replaceAll("\r\n", ""); 82 | reviewContent = reviewContent.replaceAll("\n", ""); 83 | 84 | ArrayList SubSentences = new ArrayList(); 85 | BreakIterator boundary = BreakIterator 86 | .getSentenceInstance(Locale.CHINESE); 87 | boundary.setText(reviewContent); 88 | int start = boundary.first(); 89 | //记录句子在文章中是第几句 90 | int idx = 1; 91 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary 92 | .next()) { 93 | SubSentences.add(reviewContent.substring(start, end) + ":" + idx); 94 | idx++; 95 | } 96 | 97 | ArrayList clauses = new ArrayList(); 98 | 99 | // 如果句子太长,进一步分句; 100 | for (String sent : SubSentences) { 101 | String[] senArray = 
sent.split(":"); 102 | String sentContent = senArray[0]; 103 | String senIdx = senArray[1]; 104 | int curIdx = Integer.parseInt(senIdx); 105 | int senidx = Integer.parseInt(senIdx); 106 | if (sentContent.length() >= 300) { 107 | StringTokenizer token_comma = new StringTokenizer(sentContent, ",,;", 108 | false); 109 | if (token_comma.countTokens() > 1) { 110 | while (token_comma.hasMoreTokens()) { 111 | String clause = token_comma.nextToken().trim(); 112 | if (clause.length() > 0) { 113 | clauses.add(clause + ":" + senidx); 114 | senidx++; 115 | } 116 | } 117 | } 118 | 119 | } else { 120 | clauses.add(sent.trim()); 121 | } 122 | } 123 | 124 | return clauses; 125 | } 126 | 127 | /* 128 | * 把句子按行分段,标记句子在篇章中的位置 129 | */ 130 | public static ArrayList cutTextIntoSentencesIdx(String reviewContent) { 131 | // 去掉特殊符号 132 | 133 | //reviewContent = reviewContent.replaceAll("", ""); 134 | //reviewContent = reviewContent.replaceAll("\r", ""); 135 | reviewContent = reviewContent.replaceAll(":", ""); 136 | reviewContent = reviewContent.replaceAll("\t", ""); 137 | reviewContent = reviewContent.replaceAll("\r\n", "\t"); 138 | reviewContent = reviewContent.replaceAll("\n", "\t"); 139 | 140 | String[] text = reviewContent.split("\t"); 141 | 142 | ArrayList textList = new ArrayList(); 143 | for(int i = 0 ;i < text.length;i++){ 144 | if(text[i].equals("")){ 145 | continue; 146 | } 147 | textList.add(text[i]); 148 | } 149 | 150 | // 保存句子在段落中的位置信息 151 | ArrayList clausesIdx = new ArrayList(); 152 | //记录句子在整篇文章中是第几句 153 | int j = 1; 154 | for (int i = 0; i < textList.size(); i++) { 155 | //记录句子在该段落中是第几句 156 | int k = 1; 157 | ArrayList SubSentencesIdx = new ArrayList(); 158 | BreakIterator boundary = BreakIterator 159 | .getSentenceInstance(Locale.CHINESE); 160 | boundary.setText(textList.get(i)); 161 | int start = boundary.first(); 162 | for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary 163 | .next()) { 164 | SubSentencesIdx.add(textList.get(i).substring(start, end) + ":" + i 165 | + ":" + j + ":" + k); 166 | k++; 167 | j++; 168 | } 169 | 170 | // 长句分句后该句内的每个句子的位置发生变化,但长句后的句子在段落中的位置不发生变化 171 | for (String sent : SubSentencesIdx) { 172 | String[] sentenceArray = sent.split(":"); 173 | String sentenceText = sentenceArray[0]; 174 | String paragraphIdx = sentenceArray[1]; 175 | String idx = sentenceArray[2]; 176 | String idx2 = sentenceArray[3]; 177 | if (sentenceText.length() >= 300) { 178 | StringTokenizer token_comma = new StringTokenizer( 179 | sentenceText, ",,;", false); 180 | int Idx = Integer.parseInt(idx); 181 | int Idx2 = Integer.parseInt(idx2); 182 | if (token_comma.countTokens() > 1) { 183 | while (token_comma.hasMoreTokens()) { 184 | String clause = token_comma.nextToken().trim() 185 | + ":" + paragraphIdx + ":" + Idx + ":" + Idx2; 186 | if (clause.length() > 0) { 187 | clausesIdx.add(clause); 188 | Idx++; 189 | Idx2++; 190 | } 191 | } 192 | } 193 | 194 | } else { 195 | clausesIdx.add(sent.trim()); 196 | } 197 | } 198 | 199 | } 200 | 201 | for (String str : clausesIdx) { 202 | System.out.println(str); 203 | } 204 | 205 | return clausesIdx; 206 | 207 | } 208 | 209 | } 210 | -------------------------------------------------------------------------------- /com/ext/seg/word/SegWord.java: -------------------------------------------------------------------------------- 1 | package com.ext.seg.word; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileInputStream; 7 | import java.io.FileOutputStream; 8 | 
import java.io.FileWriter; 9 | import java.io.IOException; 10 | import java.io.InputStreamReader; 11 | import java.io.OutputStreamWriter; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | import java.util.TreeSet; 15 | 16 | import org.wltea.analyzer.lucene.IKAnalyzer; 17 | 18 | import com.ext.conf.Config; 19 | import com.ext.tfidf.WordWeight; 20 | import com.huaban.analysis.jieba.JiebaSegmenter; 21 | import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; 22 | import com.huaban.analysis.jieba.SegToken; 23 | import com.huaban.analysis.jieba.WordDictionary; 24 | 25 | public class SegWord { 26 | 27 | private static JiebaSegmenter segmenter = null; 28 | 29 | public static void init() { 30 | if (segmenter == null) { 31 | WordDictionary.getInstance().init(new File(Config.SEG_PATH)); 32 | segmenter = new JiebaSegmenter(); 33 | 34 | } 35 | } 36 | 37 | // public static ArrayList segment(String sentence) { 38 | // init(); 39 | // List words = segmenter.process(sentence, SegMode.SEARCH); 40 | // int iSize = words.size(); 41 | // ArrayList word = new ArrayList(); 42 | // for (int i = 0; i < iSize; i++) { 43 | // SegToken seg = words.get(i); 44 | // word.add(sentence.substring(seg.startOffset, seg.endOffset)); 45 | // } 46 | // return word; 47 | // } 48 | 49 | public static String[] segmentByJieba(String sentence) { 50 | init(); 51 | String senStr = sentence.trim(); 52 | List words = segmenter.process(senStr, SegMode.SEARCH); 53 | int iSize = words.size(); 54 | String[] word = new String[iSize]; 55 | for (int i = 0; i < iSize; i++) { 56 | SegToken seg = words.get(i); 57 | word[i] = sentence.substring(seg.startOffset, seg.endOffset); 58 | } 59 | 60 | return word; 61 | } 62 | 63 | public static ArrayList segmentByIk(String sentence) throws Exception { 64 | ArrayList words = new ArrayList(); 65 | IKAnalyzer analyzer = new IKAnalyzer(true); 66 | words = analyzer.split(sentence); 67 | return words; 68 | } 69 | 70 | private static String clearStr(String word) { 71 | String key = word; 72 | if (null != word && !"".equals(word.trim())) { 73 | key = word.trim().toLowerCase(); 74 | } 75 | return key; 76 | } 77 | 78 | public static void main(String args[]) throws Exception{ 79 | 80 | String str = "屌丝,一个字头的诞生,屌丝"; 81 | String[] word = segmentByJieba(str); 82 | ArrayList list = segmentByIk(str); 83 | for(int i=0;i FileList = new ArrayList(); // 获取所有文件 16 | 17 | // 获取当前目录及子目录下的所有文件 18 | public static List readDirs(String filepath) 19 | throws FileNotFoundException, IOException { 20 | try { 21 | File file = new File(filepath); 22 | if (!file.isDirectory()) { 23 | System.out.println("输入的[]"); 24 | System.out.println("filepath:" + file.getAbsolutePath()); 25 | } else { 26 | String[] flist = file.list(); 27 | for (int i = 0; i < flist.length; i++) { 28 | File newfile = new File(filepath + "\\" + flist[i]); 29 | if (!newfile.isDirectory()) { 30 | FileList.add(newfile.getAbsolutePath()); 31 | } else if (newfile.isDirectory()) // if file is a directory, 32 | // call ReadDirs 33 | { 34 | readDirs(filepath + "\\" + flist[i]); 35 | } 36 | } 37 | } 38 | } catch (FileNotFoundException e) { 39 | System.out.println(e.getMessage()); 40 | } 41 | return FileList; 42 | } 43 | 44 | // 读取文件内容 45 | public static String readFile(String file) throws FileNotFoundException, 46 | IOException { 47 | StringBuffer strSb = new StringBuffer(); // String is constant, 48 | // StringBuffer can be 49 | // changed. 
50 | InputStreamReader inStrR = new InputStreamReader(new FileInputStream( 51 | file), "utf8"); // byte streams to character streams 52 | BufferedReader br = new BufferedReader(inStrR); 53 | String line = br.readLine(); 54 | while (line != null) { 55 | strSb.append(line).append("\r\n"); 56 | line = br.readLine(); 57 | } 58 | 59 | return strSb.toString(); 60 | } 61 | 62 | // 分词 63 | public static ArrayList cutWords(String file) throws IOException { 64 | 65 | ArrayList words = new ArrayList(); 66 | String text = WordWeight.readFile(file); 67 | IKAnalyzer analyzer = new IKAnalyzer(true); 68 | words = analyzer.split(text); 69 | for (int i = 0; i < words.size(); i++) { 70 | System.out.println(i + ":" + words.get(i)); 71 | } 72 | return words; 73 | } 74 | 75 | // 统计每篇文档中单词的个数 76 | public static HashMap normalTF(ArrayList cutwords) { 77 | HashMap resTF = new HashMap(); 78 | 79 | for (String word : cutwords) { 80 | if (resTF.get(word) == null) { 81 | resTF.put(word, 1); 82 | System.out.println(word); 83 | } else { 84 | resTF.put(word, resTF.get(word) + 1); 85 | System.out.println(word.toString()); 86 | } 87 | } 88 | return resTF; 89 | } 90 | 91 | // 计算每篇文档的词频 92 | public static HashMap tf(ArrayList cutwords) { 93 | HashMap resTF = new HashMap(); 94 | 95 | int wordLen = cutwords.size(); 96 | HashMap intTF = WordWeight.normalTF(cutwords); 97 | 98 | Iterator iter = intTF.entrySet().iterator(); // iterator for that get 99 | // from TF 100 | while (iter.hasNext()) { 101 | Map.Entry entry = (Map.Entry) iter.next(); 102 | resTF.put(entry.getKey().toString(), 103 | Float.parseFloat(entry.getValue().toString()) / wordLen); 104 | System.out.println(entry.getKey().toString() + " = " 105 | + Float.parseFloat(entry.getValue().toString()) / wordLen); 106 | } 107 | return resTF; 108 | } 109 | 110 | // 统计当前目录下所有文件中词出现的次数 111 | public static HashMap> normalTFAllFiles( 112 | String dirc) throws IOException { 113 | HashMap> allNormalTF = new HashMap>(); 114 | List filelist = WordWeight.readDirs(dirc); 115 | 116 | for (String file : filelist) { 117 | HashMap dict = new HashMap(); 118 | ArrayList cutwords = WordWeight.cutWords(file); // get cut 119 | // word for 120 | // one file 121 | 122 | dict = WordWeight.normalTF(cutwords); 123 | allNormalTF.put(file, dict); 124 | } 125 | return allNormalTF; 126 | } 127 | 128 | // 计算当前目录下所有文件中的词频 129 | public static HashMap> tfAllFiles(String dirc) 130 | throws IOException { 131 | HashMap> allTF = new HashMap>(); 132 | List filelist = WordWeight.readDirs(dirc); 133 | 134 | for (String file : filelist) { 135 | HashMap dict = new HashMap(); 136 | ArrayList cutwords = WordWeight.cutWords(file); // get cut 137 | // words for 138 | // one file 139 | 140 | dict = WordWeight.tf(cutwords); 141 | allTF.put(file, dict); 142 | } 143 | return allTF; 144 | } 145 | 146 | // 计算IDF 147 | public static HashMap idf( 148 | HashMap> all_tf) { 149 | HashMap resIdf = new HashMap(); 150 | HashMap dict = new HashMap(); 151 | int docNum = FileList.size(); 152 | 153 | for (int i = 0; i < docNum; i++) { 154 | HashMap temp = all_tf.get(FileList.get(i)); 155 | Iterator iter = temp.entrySet().iterator(); 156 | while (iter.hasNext()) { 157 | Map.Entry entry = (Map.Entry) iter.next(); 158 | String word = entry.getKey().toString(); 159 | if (dict.get(word) == null) { 160 | dict.put(word, 1); 161 | } else { 162 | dict.put(word, dict.get(word) + 1); 163 | } 164 | } 165 | } 166 | System.out.println("IDF for every word is:"); 167 | Iterator iter_dict = dict.entrySet().iterator(); 168 | while (iter_dict.hasNext()) { 
    // Inverse document frequency for every word in the corpus
    public static HashMap<String, Float> idf(
            HashMap<String, HashMap<String, Float>> all_tf) {
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        HashMap<String, Integer> dict = new HashMap<String, Integer>(); // word -> document frequency
        int docNum = FileList.size();

        for (int i = 0; i < docNum; i++) {
            HashMap<String, Float> temp = all_tf.get(FileList.get(i));
            for (String word : temp.keySet()) {
                if (dict.get(word) == null) {
                    dict.put(word, 1);
                } else {
                    dict.put(word, dict.get(word) + 1);
                }
            }
        }
        System.out.println("IDF for every word is:");
        for (Map.Entry<String, Integer> entry : dict.entrySet()) {
            float value = (float) Math.log((float) docNum / entry.getValue());
            resIdf.put(entry.getKey(), value);
            System.out.println(entry.getKey() + " = " + value);
        }
        return resIdf;
    }

    // TF-IDF for every word of every file
    public static void tf_idf(HashMap<String, HashMap<String, Float>> all_tf,
            HashMap<String, Float> idfs) {
        HashMap<String, HashMap<String, Float>> resTfIdf = new HashMap<String, HashMap<String, Float>>();

        int docNum = FileList.size();
        for (int i = 0; i < docNum; i++) {
            String filepath = FileList.get(i);
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            HashMap<String, Float> temp = all_tf.get(filepath);
            for (Map.Entry<String, Float> entry : temp.entrySet()) {
                String word = entry.getKey();
                Float value = entry.getValue() * idfs.get(word);
                tfidf.put(word, value);
            }
            resTfIdf.put(filepath, tfidf);
        }
        System.out.println("TF-IDF for every file is:");
        DisTfIdf(resTfIdf);
    }

    // Print the TF-IDF map of every file
    public static void DisTfIdf(HashMap<String, HashMap<String, Float>> tfidf) {
        for (Map.Entry<String, HashMap<String, Float>> entrys : tfidf.entrySet()) {
            System.out.println("FileName: " + entrys.getKey());
            System.out.print("{");
            for (Map.Entry<String, Float> entry : entrys.getValue().entrySet()) {
                System.out.print(entry.getKey() + " = " + entry.getValue() + ", ");
            }
            System.out.println("}");
        }
    }

    public static void main(String[] args) throws IOException {
        String dir = "D:\\NLPIR\\NLPIR2015\\test\\test";

        HashMap<String, HashMap<String, Float>> all_tf = tfAllFiles(dir);
        System.out.println();
        HashMap<String, Float> idfs = idf(all_tf);
        System.out.println();
        tf_idf(all_tf, idfs);
    }

}
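tf_idf above only prints its result. To rank words for the topic-word step, the per-file HashMap<String, Float> has to be sorted by weight; a minimal sketch (TopKWords is a hypothetical helper, and it assumes tf_idf is changed to return resTfIdf instead of printing it):

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TopKWords { // hypothetical helper, not part of this repo
    // Return the k highest-weighted words of one file's TF-IDF map
    public static List<String> topK(HashMap<String, Float> tfidf, int k) {
        List<Map.Entry<String, Float>> entries =
                new ArrayList<Map.Entry<String, Float>>(tfidf.entrySet());
        Collections.sort(entries, new Comparator<Map.Entry<String, Float>>() {
            public int compare(Map.Entry<String, Float> a,
                    Map.Entry<String, Float> b) {
                return Float.compare(b.getValue(), a.getValue()); // descending by weight
            }
        });
        List<String> result = new ArrayList<String>();
        for (int i = 0; i < Math.min(k, entries.size()); i++) {
            result.add(entries.get(i).getKey());
        }
        return result;
    }
}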
--------------------------------------------------------------------------------
/com/ext/topic/utils/ReadConfigUtil.java:
--------------------------------------------------------------------------------

package com.ext.topic.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class ReadConfigUtil {
    private static Properties config = null;
    static {
        InputStream in = null;
        try {
            // Prefer nlpir.properties in the working directory, then fall back to the classpath
            File config_file_path = new File("nlpir.properties");
            if (config_file_path.exists()) {
                in = new FileInputStream(config_file_path);
            } else {
                in = ReadConfigUtil.class.getClassLoader().getResourceAsStream(
                        "nlpir.properties");
            }
        } catch (FileNotFoundException e1) {
            System.out.println("Config file nlpir.properties not found!");
        }
        config = new Properties();
        try {
            if (in == null) {
                System.out.println("Config file nlpir.properties not found!");
            } else {
                config.load(in);
                in.close();
            }
        } catch (IOException e) {
            System.out.println("Error loading nlpir.properties");
        }
    }

    // Look up a configuration value by key
    public static String getValue(String key) {
        try {
            return config.getProperty(key);
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("ConfigInfoError: " + e.toString());
            return null;
        }
    }

    public static void main(String args[]) {
        System.out.println(getValue("dll_or_so_path"));
    }
}

--------------------------------------------------------------------------------
/com/ext/topic/utils/SystemParas.java:
--------------------------------------------------------------------------------

package com.ext.topic.utils;

/**
 * Loads system configuration parameters
 *
 * @author kernal
 *
 */
public class SystemParas {
    public static String dll_or_so_lib_path = ReadConfigUtil.getValue("dll_or_so_path");
}
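ReadConfigUtil and SystemParas expect a nlpir.properties file in the working directory or on the classpath, holding at least the dll_or_so_path key. A minimal example (the value is illustrative; it should point at the NLPIR native library on the target machine, matching the path hard-coded in the classes below):

# nlpir.properties
dll_or_so_path=D:/NLPIR/bin/NLPIR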
--------------------------------------------------------------------------------
/com/ext/topic/word/ExtractTopicWord.java:
--------------------------------------------------------------------------------

package com.ext.topic.word;

import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;

import com.ext.topic.utils.SystemParas;
import com.sun.jna.Library;
import com.sun.jna.Native;

public class ExtractTopicWord {

    // JNA binding to the native NLPIR library
    public interface CLibrary extends Library {
        // Load the native library through a static instance; the hard-coded path
        // could instead come from SystemParas.dll_or_so_lib_path
        CLibrary Instance = (CLibrary) Native.loadLibrary(
                "D:\\NLPIR\\bin\\NLPIR", CLibrary.class);

        public int NLPIR_Init(String sDataPath, int encoding,
                String sLicenceCode);

        public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        public String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        public int NLPIR_AddUserWord(String sWord); // add by qp 2008.11.10

        public int NLPIR_DelUsrWord(String sWord); // add by qp 2008.11.10

        public String NLPIR_GetLastErrorMsg();

        public void NLPIR_Exit();
    }

    public static String transString(String aidString, String ori_encoding,
            String new_encoding) {
        try {
            return new String(aidString.getBytes(ori_encoding), new_encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static HashSet<String> getKeyWords(String fileContent,
            ArrayList<String> lableList) throws Exception {
        String argu = "D:\\NLPIR\\NLPIR2016\\20140928";
        // String system_charset = "GBK"; // GBK = 0
        String system_charset = "UTF-8";
        int charset_type = 1; // UTF-8
        String[] keywords = null;
        int init_flag = CLibrary.Instance.NLPIR_Init(argu, charset_type, "0");
        String nativeBytes = null;
        String nativeByte = null;
        if (0 == init_flag) {
            nativeBytes = CLibrary.Instance.NLPIR_GetLastErrorMsg();
            System.err.println("Initialization failed! Reason: " + nativeBytes);
            return null;
        }

        try {
            nativeByte = CLibrary.Instance.NLPIR_GetKeyWords(fileContent, 20,
                    false);
            System.out.print("Keyword extraction result: " + nativeByte);
            keywords = nativeByte.split("#");
            CLibrary.Instance.NLPIR_Exit();
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        HashSet<String> lableSet = new HashSet<String>();

        // Add the article labels as part of the keyword set
        if (lableList != null) {
            for (int i = 0; i < lableList.size(); i++) {
                lableSet.add(lableList.get(i));
            }
        }

        if (keywords != null) {
            for (int j = 0; j < keywords.length; j++) {
                // Skip the stray "Line 0" entries in the NLPIR output
                if (keywords[j].equals("Line 0")) {
                    continue;
                }
                lableSet.add(keywords[j]);
            }
        }

        return lableSet;
    }

    public static void main(String[] args) throws Exception {

        String fileContent = "两人打甲流疫苗后死亡 另有15例较严重异常反应";

        ArrayList<String> lableList = new ArrayList<String>();
        lableList.add("疫苗");
        lableList.add("北京");

        HashSet<String> result = getKeyWords(fileContent, lableList);

        if (result != null) {
            for (String str : result) {
                System.out.println(str);
            }
        }
    }
}
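NLPIR_GetKeyWords appears to return all keywords in a single '#'-delimited string, which is why getKeyWords splits on "#" and skips the stray "Line 0" entries. A minimal, self-contained illustration (the sample string is made up; real output depends on the NLPIR version):

public class KeywordSplitDemo { // hypothetical demo class, not part of this repo
    public static void main(String[] args) {
        String raw = "疫苗#死亡#北京#Line 0"; // stand-in for NLPIR_GetKeyWords output
        for (String kw : raw.split("#")) {
            if (!kw.equals("Line 0")) {
                System.out.println(kw);
            }
        }
    }
}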
--------------------------------------------------------------------------------
/com/ext/topic/word/NlpirTest.java:
--------------------------------------------------------------------------------

package com.ext.topic.word;

import java.io.UnsupportedEncodingException;

import com.ext.topic.utils.SystemParas;

import com.sun.jna.Library;
import com.sun.jna.Native;

public class NlpirTest {

    // JNA binding to the native NLPIR library
    public interface CLibrary extends Library {
        // Load the native library through a static instance; the hard-coded path
        // could instead come from SystemParas.dll_or_so_lib_path
        CLibrary Instance = (CLibrary) Native.loadLibrary(
                "D:\\NLPIR\\bin\\NLPIR", CLibrary.class);

        public int NLPIR_Init(String sDataPath, int encoding,
                String sLicenceCode);

        public String NLPIR_ParagraphProcess(String sSrc, int bPOSTagged);

        public String NLPIR_GetKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        public String NLPIR_GetFileKeyWords(String sLine, int nMaxKeyLimit,
                boolean bWeightOut);

        public int NLPIR_AddUserWord(String sWord); // add by qp 2008.11.10

        public int NLPIR_DelUsrWord(String sWord); // add by qp 2008.11.10

        public String NLPIR_GetLastErrorMsg();

        public void NLPIR_Exit();
    }

    public static String transString(String aidString, String ori_encoding,
            String new_encoding) {
        try {
            return new String(aidString.getBytes(ori_encoding), new_encoding);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return null;
    }

    public static String getKeyWords(String filepath) throws Exception {
        String argu = "D:\\NLPIR";
        // String system_charset = "GBK"; // GBK = 0
        String system_charset = "UTF-8";
        int charset_type = 1; // UTF-8

        int init_flag = CLibrary.Instance.NLPIR_Init(argu, charset_type, "0");
        String nativeBytes = null;
        String nativeByte = null;
        if (0 == init_flag) {
            nativeBytes = CLibrary.Instance.NLPIR_GetLastErrorMsg();
            System.err.println("Initialization failed! Reason: " + nativeBytes);
            return null;
        }

        String sInput = "据悉,质检总局已将最新有关情况再次通报美方,要求美方加强对输华玉米的产地来源、运输及仓储等环节的管控措施,有效避免输华玉米被未经我国农业部安全评估并批准的转基因品系污染。";

        try {
            nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, 1);
            System.out.println("Segmentation result: " + nativeBytes);

            // CLibrary.Instance.NLPIR_AddUserWord("要求美方加强对输 n");
            // CLibrary.Instance.NLPIR_AddUserWord("华玉米的产地来源 n");
            // nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, 1);
            // System.out.println("Segmentation after adding user words: " + nativeBytes);
            //
            // CLibrary.Instance.NLPIR_DelUsrWord("要求美方加强对输");
            // nativeBytes = CLibrary.Instance.NLPIR_ParagraphProcess(sInput, 1);
            // System.out.println("Segmentation after deleting user words: " + nativeBytes);

            nativeByte = CLibrary.Instance.NLPIR_GetKeyWords(sInput,
                    nativeBytes.length(), true);
            System.out.print("Keyword extraction result: " + nativeByte);

            nativeByte = CLibrary.Instance.NLPIR_GetFileKeyWords(filepath, 10,
                    true);
            System.out.print("Keyword extraction result: " + nativeByte);

            CLibrary.Instance.NLPIR_Exit();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return nativeByte;
    }

    public static void main(String[] args) throws Exception {

        String filepath = "D:\\NLPIR\\NLPIR2015\\test\\屌丝,一个字头的诞生.TXT";

        getKeyWords(filepath);
    }
}

--------------------------------------------------------------------------------
/com/ext/word/weight/GetAllEntropy.java:
--------------------------------------------------------------------------------

package com.ext.word.weight;

import java.util.ArrayList;

import com.ext.conf.StopWords;
import com.ext.predeal.CutTextIntoSentence;
import com.ext.seg.word.SegWord;

public class GetAllEntropy {

    // Entropy-based weight for every word of a document: split into sentences,
    // segment each sentence, drop stop words, then score the remaining words
    // by information entropy
    public static String[] getAllEntropy(String reviewContent) {

        ArrayList<String> sentenceList = CutTextIntoSentence
                .cutTextIntoSentences(reviewContent, 0);

        ArrayList<String> words = new ArrayList<String>();

        for (int i = 0; i < sentenceList.size(); i++) {
            String[] word = SegWord.segmentByJieba(sentenceList.get(i));
            for (int j = 0; j < word.length; j++) {
                words.add(word[j]);
            }
        }

        String[] wordStr = new String[words.size()];
        for (int i = 0; i < words.size(); i++) {
            wordStr[i] = words.get(i);
        }

        ArrayList<String> wordClean = StopWords.cutStopWords(wordStr);

        String[] wordResult = new String[wordClean.size()];
        for (int k = 0; k < wordClean.size(); k++) {
            wordResult[k] = wordClean.get(k);
        }

        // Weight of every word in this document
        String[] wordEntropy = GetWordEntropy.CalculateWordEntropy(wordResult);

        return wordEntropy;
    }
}
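GetWordEntropy (the next file) weights a word w that occurs count times by the diversity of its immediate neighbors:

    H(w) = H_left(w) + H_right(w),  with  H_left(w) = -Σ (freq(l)/count) * ln(freq(l)/count)

where l ranges over the distinct words observed directly to the left of w (the right side is symmetric, and a "null" marker stands in for the text boundary). Words that appear in many different contexts score high; a word locked into a single fixed phrase scores near 0.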
--------------------------------------------------------------------------------
/com/ext/word/weight/GetWordEntropy.java:
--------------------------------------------------------------------------------

package com.ext.word.weight;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import com.ext.seg.word.SegWord;
import com.ext.tfidf.WordWeight;

/**
 * Computes the information entropy of every word in a text
 * (weighting steps such as boosting named entities could be added later)
 *
 * @author Administrator
 *
 */
public class GetWordEntropy {

    public static void main(String[] args) throws Exception {

        String sentence = "拿到手机用了一下,首先说说优点:原生的android系统跟国内定制过的系统可以说是天壤之别,十分流畅,毫不弱于ios,其次moto的手机手感也不错;当然这款手机也有一些缺点,比如使用的习惯跟其他国产手机有些区别。";

        String file = "E:\\testfile\\test.txt";

        // String[] words = SegWord.segmentByJieba(sentence);
        // ArrayList<String> listWords = SegWord.segmentByIk(sentence);

        ArrayList<String> listWords2 = WordWeight.cutWords(file);

        String[] words2 = new String[listWords2.size()];
        for (int i = 0; i < listWords2.size(); i++) {
            words2[i] = listWords2.get(i);
        }

        CalculateWordEntropy(words2);
    }

    // Combined left/right neighbor entropy of every distinct word: the more
    // varied the contexts a word occurs in, the higher its weight
    public static String[] CalculateWordEntropy(String[] words) {

        int length = words.length;
        if (length == 0) {
            return new String[0];
        }

        // Store the segmented words as (left, center, right) triples;
        // "null" marks a missing neighbor at the text boundaries
        ArrayList<String[]> wordList = new ArrayList<String[]>();
        for (int i = 0; i < length; i++) {
            String[] wordSeg = new String[3];
            wordSeg[0] = (i == 0) ? "null" : words[i - 1];
            wordSeg[1] = words[i];
            wordSeg[2] = (i == length - 1) ? "null" : words[i + 1];
            wordList.add(wordSeg);
        }

        // Distinct words, in order of first occurrence
        List<String> tempList = new ArrayList<String>();
        for (String str : words) {
            if (!tempList.contains(str)) {
                tempList.add(str);
            }
        }
        String[] wordClean = tempList.toArray(new String[tempList.size()]);

        // For each distinct word, collect the triples centered on it and add up
        // the entropy of its left and right neighbor distributions
        double[] allEntropy = new double[wordClean.length];
        for (int n = 0; n < wordClean.length; n++) {
            ArrayList<String> leftword = new ArrayList<String>();
            ArrayList<String> rightword = new ArrayList<String>();
            for (String[] wordSegStr : wordList) {
                if (wordSegStr[1].equals(wordClean[n])) {
                    leftword.add(wordSegStr[0]);
                    rightword.add(wordSegStr[2]);
                }
            }
            int count = leftword.size(); // occurrences of the center word
            allEntropy[n] = neighborEntropy(leftword, count)
                    + neighborEntropy(rightword, count);
        }

        // Format the result as "word:entropy"
        String[] entropyResult = new String[allEntropy.length];
        for (int i = 0; i < allEntropy.length; i++) {
            entropyResult[i] = wordClean[i] + ":" + allEntropy[i];
        }

        return entropyResult;
    }

    // Entropy of one neighbor distribution: -sum (freq/count) * ln(freq/count)
    private static double neighborEntropy(List<String> neighbors, int count) {
        HashMap<String, Integer> freq = new HashMap<String, Integer>();
        for (String w : neighbors) {
            Integer c = freq.get(w);
            freq.put(w, c == null ? 1 : c + 1);
        }
        double entropy = 0.0;
        for (Integer f : freq.values()) {
            double p = (double) f / count;
            entropy += -p * Math.log(p);
        }
        return entropy;
    }
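    // Worked example: for words = {"A", "B", "A", "B", "C"} the triples centered
    // on "A" are (null, A, B) and (B, A, B), so count = 2 for "A".
    // Left neighbors "null" and "B" occur once each:
    //     H_left = 2 * (-(1/2) * ln(1/2)) = ln 2 ~= 0.693
    // The right neighbor "B" occurs twice:
    //     H_right = -(2/2) * ln(2/2) = 0
    // CalculateWordEntropy therefore reports "A:0.6931471805599453".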
    // Sum the entropy of every word, as an aggregate score for a word sequence
    public static double getAllEntropy(String[] word) {

        double entropy = 0.0;

        String[] entropyResult = CalculateWordEntropy(word);

        if (entropyResult != null) {
            for (int i = 0; i < entropyResult.length; i++) {
                String[] result = entropyResult[i].split(":");
                if (result.length > 1 && result[1].trim().length() > 0) {
                    entropy += Double.valueOf(result[1].trim());
                }
            }
        }

        return entropy;
    }

}

--------------------------------------------------------------------------------