├── QA-server.zip ├── README.md ├── spider-百科.java ├── spider-问答社区.zip ├── spider-领域网站.zip ├── 微信平台接口验证.java ├── 改进后的基于web领域术语抽取 ├── 通用方法-提取网页正文.zip ├── 通用方法-正文预处理.java ├── 问句匹配方法.zip ├── 问句检索方法.zip └── 项目-jar包.zip /QA-server.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/QA-server.zip -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Domain_QA 2 | 限定域问答系统包括:自动构建知识库、问句检索、基于微信平台搭建问答系统。本项目所有代码已开源。 3 | 用户通过简单配置,可以实现快速自动化搭建一个比较完备的领域知识库。另外,基于微信平台如何通过配置来搭建问答系统,具体操作见readme.md 4 | 5 | 1 申请微信公众号(订阅号/服务号) 6 | 7 | 2 使用云服务器/Tomcat+花生壳 搭建本地服务器 8 | 9 | 3 配置微信公众平台接口。并利用源码中的:微信接口认证代码进行验证 10 | 11 | 4 提供“领域名”+“领域网站”,利用源码中的:spider-领域网站源码爬取领域网站语料,为自动抽取领域实体词准备语料 12 | 13 | 5 将领域网站语料提取正文并做分词 14 | 15 | 6 采用word2vec对上述语料进行训练,得到模型model 16 | 17 | 7 利用model及种子术语seed,获取候选术语 18 | 19 | 8 计算候选术语和种子术语的similar相似度,设置相似度阈值为0.6,对候选术语进行过滤 20 | 21 | 9 得到领域术语文件 seedfile.txt 22 | 23 | 10 结合seedfile.txt ,采用源码中的:spider-百科、spider-问答社区爬虫源码自动构建该领域知识库。 24 | 25 | 11 基于lucene对关系型领域知识库建立倒排索引 26 | 27 | 12 获取用户问句,对用户问句进行分析,获取关键词及限定词等 28 | 29 | 13 根据倒排索引及关键词获取初期候选问题集 30 | 31 | 14 结合问句匹配特征对候选问题集重新排序 32 | 33 | 15 将各个部分进行连接测试,完成限定域问答系统的搭建 34 | 35 | 16 总结:自动化构建领域知识库、基于领域知识库及在线社区进行问句检索、调试微信服务器及本地服务器、对用户问句进行分析,共4大模块 36 | 37 | 开发环境 38 | 39 | 处理器:Intel(R)Core(TM)i5-2400 Cpu@ 3.10GHz 3.10Ghz 安装内存(RAM):6.00GB 40 | 41 | 系统类型:win32 硬盘:195GB 42 | 43 | 使用语言:JAVA、Python IDE:Eclipse、Myeclipse、python27 44 | 45 | 服务器:Tomcat、花生壳 客户端:微信客户端 46 | 47 | 数据库:Mysql 数据库管理:PhpAdmin 48 | 49 | 网络通信: HITSZ 校内网(单个模块运行)、基于花生壳的公网(系统运行) 50 | 51 | 测试方法:单元测试、集成测试、回归测试、系统测试、黑盒测试、白盒测试 52 | 53 | 其他:微信公众平台API、Jsoup解析包、Dom解析包、phpAdmin关系型数据库管理客户端、Gensim开源工具包、Hanlp 开源工具包、Lucene开源工具包、Github托管等 54 | 55 | 文件指定位置: 56 | 57 | E:\QA_database\website-knowledge 58 | 59 | E:\QA_database\website-正文 60 | 61 | E:\QA_database\website-正文预处理 62 | 63 | E:\QA_database\QACommunity_搜搜问问 64 | 65 | E:\QA_database\baike_infobox知识 66 | 67 | E:\QA_database\baike 68 | 69 | 邮箱:steady_pace@126.com liqianqian 如有其它问题,可联系作者。 70 | -------------------------------------------------------------------------------- /spider-百科.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | import java.io.BufferedReader; 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | 7 | import org.apache.commons.httpclient.HttpClient; 8 | import org.apache.commons.httpclient.methods.GetMethod; 9 | import org.jsoup.Jsoup; 10 | import org.jsoup.nodes.Document; 11 | import org.jsoup.nodes.Element; 12 | import org.jsoup.select.Elements; 13 | 14 | 15 | public class Baike { 16 | //?��????url 17 | //�?�?请�?�??��?? 18 | 19 | 20 | 21 | public static void fetch(String url) throws IOException 22 | { 23 | 24 | HttpClient httpClient = new HttpClient(); 25 | GetMethod getMethod = new GetMethod(url); 26 | int statusCode = httpClient.executeMethod(getMethod); 27 | if(statusCode >= 200 && statusCode < 400) 28 | { 29 | String result; 30 | result = getMethod.getResponseBodyAsString(); //getMethod.getRespon 31 | String response = new String(getMethod.getResponseBodyAsString().getBytes("ISO-8859-1"),"UTF-8"); 32 | 33 | //打印返回的信息 34 | 35 | System.out.println(response); 36 | 37 | 38 | // System.out.println(result); 39 | Document doc = Jsoup.parse(response);//result.toString());//(temp); 40 | String title =doc.head().select("title").text(); //doc2.head().select("title").text(); 41 | System.out.println("title:"+title); 42 | 43 | //抽取所有 dt,所有dd 44 | 45 | Elements ListDiv = doc.getElementsByAttributeValue("class","basicInfo-item name"); 46 | for (Element element :ListDiv) { 47 | System.out.println(element.html());} 48 | 49 | Elements ListDiv2 = doc.getElementsByAttributeValue("class","basicInfo-item value"); 50 | for (Element element :ListDiv2) { 51 | System.out.println(element.html());} 52 | 53 | 54 | //提取正文 55 | //parse(response); 56 | System.out.println( TextExtract.parse(response)); 57 | // System.out.println("result"+result); 58 | getMethod.releaseConnection(); 59 | //saveFile("D:\\�?�?天�?\\"+temp+".txt", result.toString()); 60 | } 61 | } 62 | 63 | public static String InputStream2String(InputStream in_st,String charset) throws IOException{ 64 | BufferedReader buff = new BufferedReader(new InputStreamReader(in_st, charset)); 65 | StringBuffer res = new StringBuffer(); 66 | String line = ""; 67 | while((line = buff.readLine()) != null){ 68 | res.append(line); 69 | } 70 | return res.toString(); 71 | } 72 | 73 | 74 | 75 | public static String toGbkString(String s) 76 | { 77 | StringBuffer sb = new StringBuffer(); 78 | for(int i = 0; i < s.length(); i++) 79 | { 80 | char c = s.charAt(i); 81 | if(c >= 0 && c <= 255){ 82 | sb.append(c); 83 | }else{ 84 | byte[] b; 85 | try{ 86 | b = String.valueOf(c).getBytes("UTF-8"); 87 | }catch (Exception e) { 88 | // TODO: handle exception 89 | e.printStackTrace(); 90 | b = new byte[0]; 91 | } 92 | for(int j = 0; j < b.length; j++){ 93 | int k = b[j]; 94 | if(k < 0) 95 | k+=256; 96 | sb.append("%"+Integer.toHexString(k).toUpperCase()); 97 | } 98 | } 99 | } 100 | return sb.toString(); 101 | } 102 | 103 | 104 | 105 | 106 | 107 | public static void main(String[] args) throws IOException 108 | { 109 | String key="大丽花"; 110 | String s=toGbkString(key); 111 | System.out.println(s); 112 | 113 | String url="http://baike.baidu.com/item/"+s; 114 | fetch(url); 115 | 116 | } 117 | 118 | 119 | 120 | 121 | } 122 | -------------------------------------------------------------------------------- /spider-问答社区.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/spider-问答社区.zip -------------------------------------------------------------------------------- /spider-领域网站.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/spider-领域网站.zip -------------------------------------------------------------------------------- /微信平台接口验证.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/微信平台接口验证.java -------------------------------------------------------------------------------- /改进后的基于web领域术语抽取: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/改进后的基于web领域术语抽取 -------------------------------------------------------------------------------- /通用方法-提取网页正文.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/通用方法-提取网页正文.zip -------------------------------------------------------------------------------- /通用方法-正文预处理.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileInputStream; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStreamReader; 10 | import java.io.OutputStreamWriter; 11 | import java.io.UnsupportedEncodingException; 12 | import java.nio.charset.Charset; 13 | import java.nio.file.Files; 14 | import java.nio.file.Paths; 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | 18 | public class WordSeg { 19 | 20 | private static final List DIC = new ArrayList<>(); 21 | private static int MAX_LENGTH; 22 | static{ 23 | try { 24 | System.out.println("开始初始化词典"); 25 | int max=1; 26 | int count=0; 27 | List lines = Files.readAllLines(Paths.get("D:/dic.txt"), Charset.forName("utf-8")); 28 | for(String line : lines){ 29 | DIC.add(line); 30 | count++; 31 | if(line.length()>max){ 32 | max=line.length(); 33 | } 34 | } 35 | MAX_LENGTH = max; 36 | System.out.println("完成初始化词典,词数目:"+count); 37 | System.out.println("最大分词长度:"+MAX_LENGTH); 38 | } catch (IOException ex) { 39 | System.err.println("词典装载失败:"+ex.getMessage()); 40 | } 41 | 42 | } 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | public static void saveFile_zhuijia(String fileName,String content) throws IOException 52 | { 53 | File file =new File(fileName); 54 | if(!file.exists()) 55 | { 56 | // System.out.println("娑�锟斤拷锟斤拷); 57 | System.out.println(file.createNewFile()); 58 | file.createNewFile(); 59 | } 60 | 61 | 62 | FileOutputStream fos = new FileOutputStream(file,true); 63 | OutputStreamWriter out =new OutputStreamWriter(fos,"utf8");//(fos,"gb2312"); 64 | BufferedWriter bw = new BufferedWriter(out); 65 | bw.write(content); 66 | bw.write("\r\n"); 67 | 68 | 69 | 70 | 71 | bw.flush(); 72 | // System.out.println("锟斤拷缂�锟斤拷锟�"); 73 | 74 | } 75 | 76 | 77 | public static ArrayList getListFiles(Object obj) { 78 | File directory = null; 79 | if (obj instanceof File) { 80 | directory = (File) obj; 81 | } else { 82 | directory = new File(obj.toString()); 83 | } 84 | ArrayList files = new ArrayList(); 85 | if (directory.isFile()) { 86 | files.add(directory); 87 | return files; 88 | } else if (directory.isDirectory()) { 89 | File[] fileArr = directory.listFiles(); 90 | for (int i = 0; i < fileArr.length; i++) { 91 | File fileOne = fileArr[i]; 92 | files.addAll(getListFiles(fileOne)); 93 | } 94 | } 95 | return files; 96 | } 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | public static void main(String[] args) throws IOException{ 108 | String text = "杨尚川是APDPlat应用级产品开发平台的作者"; 109 | 110 | //遍历所有所有的文件 111 | //每个文件,读一行切分一行。并写入文件中 112 | 113 | 114 | 115 | WordSeg w=new WordSeg(); 116 | 117 | 118 | 119 | // ArrayList files=getListFiles("D:\\TFIDF"); 120 | ArrayList files=getListFiles("D:\\手机-正文"); 121 | System.out.println(files); 122 | //UseDemo t=new UseDemo(); 123 | 124 | for(int i=0;i seg(String text){ 220 | List result = new ArrayList<>(); 221 | while(text.length()>0){ 222 | int len=MAX_LENGTH; 223 | if(text.length()