├── QA-server.zip
├── README.md
├── spider-百科.java
├── spider-问答社区.zip
├── spider-领域网站.zip
├── 微信平台接口验证.java
├── 改进后的基于web领域术语抽取
├── 通用方法-提取网页正文.zip
├── 通用方法-正文预处理.java
├── 问句匹配方法.zip
├── 问句检索方法.zip
└── 项目-jar包.zip


/QA-server.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/QA-server.zip


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Domain_QA
 2 | 限定域问答系统包括：自动构建知识库、问句检索、基于微信平台搭建问答系统。本项目所有代码已开源。
 3 | 用户通过简单配置，即可快速、自动化地搭建一个比较完备的领域知识库。另外，基于微信平台通过配置搭建问答系统的具体操作步骤如下：
 4 | 
 5 | 1 申请微信公众号（订阅号/服务号）
 6 | 
 7 | 2 使用云服务器/Tomcat+花生壳 搭建本地服务器
 8 | 
 9 | 3 配置微信公众平台接口。并利用源码中的：微信接口认证代码进行验证
10 | 
11 | 4 提供“领域名”+“领域网站”，利用源码中的：spider-领域网站源码爬取领域网站语料，为自动抽取领域实体词准备语料
12 | 
13 | 5 将领域网站语料提取正文并做分词
14 | 
15 | 6 采用word2vec对上述语料进行训练，得到模型model
16 | 
17 | 7 利用model及种子术语seed，获取候选术语
18 | 
19 | 8 计算候选术语和种子术语的相似度，设置相似度阈值为0.6，对候选术语进行过滤
20 | 
21 | 9 得到领域术语文件 seedfile.txt
22 | 
23 | 10 结合seedfile.txt ,采用源码中的：spider-百科、spider-问答社区爬虫源码自动构建该领域知识库。
24 | 
25 | 11 基于lucene对关系型领域知识库建立倒排索引
26 | 
27 | 12 获取用户问句，对用户问句进行分析，获取关键词及限定词等
28 | 
29 | 13 根据倒排索引及关键词获取初期候选问题集
30 | 
31 | 14 结合问句匹配特征对候选问题集重新排序
32 | 
33 | 15 将各个部分进行连接测试，完成限定域问答系统的搭建
34 | 
35 | 16 总结：自动化构建领域知识库、基于领域知识库及在线社区进行问句检索、调试微信服务器及本地服务器、对用户问句进行分析，共4大模块
36 | 
37 | 开发环境
38 | 
39 | 处理器：Intel(R) Core(TM) i5-2400 CPU @ 3.10GHz 3.10GHz               安装内存（RAM）：6.00GB
40 | 
41 | 系统类型：win32                                                       硬盘：195GB
42 | 
43 | 使用语言：Java、Python                                                IDE：Eclipse、MyEclipse、Python 2.7
44 | 
45 | 服务器：Tomcat、花生壳                                                 客户端：微信客户端
46 | 
47 | 数据库：MySQL                                                         数据库管理：phpMyAdmin
48 | 
49 | 网络通信： HITSZ 校内网（单个模块运行）、基于花生壳的公网（系统运行）
50 | 
51 | 测试方法：单元测试、集成测试、回归测试、系统测试、黑盒测试、白盒测试
52 | 
53 | 其他：微信公众平台API、Jsoup解析包、Dom解析包、phpMyAdmin关系型数据库管理客户端、Gensim开源工具包、HanLP开源工具包、Lucene开源工具包、GitHub托管等
54 | 
55 | 文件指定位置：
56 | 
57 | E:\QA_database\website-knowledge
58 | 
59 | E:\QA_database\website-正文
60 | 
61 | E:\QA_database\website-正文预处理
62 | 
63 | E:\QA_database\QACommunity_搜搜问问
64 | 
65 | E:\QA_database\baike_infobox知识
66 | 
67 | E:\QA_database\baike
68 | 
69 | 邮箱：steady_pace@126.com liqianqian  如有其它问题，可联系作者。
70 | 


--------------------------------------------------------------------------------
/spider-百科.java:
--------------------------------------------------------------------------------
  1 | package segment;
  2 | import java.io.BufferedReader;
  3 | import java.io.IOException;
  4 | import java.io.InputStream;
  5 | import java.io.InputStreamReader;
  6 | 
  7 | import org.apache.commons.httpclient.HttpClient;
  8 | import org.apache.commons.httpclient.methods.GetMethod;
  9 | import org.jsoup.Jsoup;
 10 | import org.jsoup.nodes.Document;
 11 | import org.jsoup.nodes.Element;
 12 | import org.jsoup.select.Elements;
 13 | 
 14 | 
 15 | public class Baike {
 16 | 	//?��????url
 17 | 	//�?�?请�?�??��??
 18 | 	
 19 | 	
 20 | 	
 21 | 	public static void  fetch(String url) throws IOException
 22 | 	{
 23 |   
 24 | 		HttpClient httpClient = new HttpClient();		
 25 | 		GetMethod getMethod = new GetMethod(url);	
 26 | 		int statusCode = httpClient.executeMethod(getMethod);
 27 | 		if(statusCode >= 200 && statusCode < 400)				
 28 | 			{				
 29 | 				String result;				
 30 | 				result = getMethod.getResponseBodyAsString();    //getMethod.getRespon	
 31 | 				  String response =   new String(getMethod.getResponseBodyAsString().getBytes("ISO-8859-1"),"UTF-8");    
 32 | 
 33 | 			       //打印返回的信息    
 34 | 
 35 | 			     System.out.println(response);    
 36 | 				
 37 | 				
 38 | 			//	 System.out.println(result);
 39 | 				 Document doc = Jsoup.parse(response);//result.toString());//(temp);	
 40 | 		  		 String title =doc.head().select("title").text(); //doc2.head().select("title").text(); 
 41 | 		  		 System.out.println("title:"+title);
 42 | 		  		 
 43 | 		  		 //抽取所有 dt,所有dd
 44 | 
 45 | 		  		 Elements ListDiv = doc.getElementsByAttributeValue("class","basicInfo-item name");
 46 | 		            for (Element element :ListDiv) {
 47 | 		                System.out.println(element.html());}
 48 | 		            
 49 | 		            Elements ListDiv2 = doc.getElementsByAttributeValue("class","basicInfo-item value");
 50 | 		            for (Element element :ListDiv2) {
 51 | 		                System.out.println(element.html());}
 52 | 		            
 53 | 		            
 54 | 		            //提取正文 
 55 | 		           //parse(response);
 56 | 		            System.out.println( TextExtract.parse(response));
 57 | 		  		// System.out.println("result"+result);
 58 | 		  		 getMethod.releaseConnection();
 59 |                  //saveFile("D:\\�?�?天�?\\"+temp+".txt", result.toString());
 60 | 			}
 61 | 	}
 62 | 	
 63 | 	public static String InputStream2String(InputStream in_st,String charset) throws IOException{
 64 |         BufferedReader buff = new BufferedReader(new InputStreamReader(in_st, charset));
 65 |         StringBuffer res = new StringBuffer();
 66 |         String line = "";
 67 |         while((line = buff.readLine()) != null){
 68 |             res.append(line);
 69 |         }
 70 |         return res.toString();
 71 |     }
 72 | 	
 73 | 	
 74 | 	
 75 | 	public static String toGbkString(String s)
 76 | 	{
 77 | 		StringBuffer sb = new StringBuffer();
 78 | 		for(int i = 0; i < s.length(); i++)
 79 | 		{
 80 | 		char c = s.charAt(i);
 81 | 		if(c >= 0 && c <= 255){
 82 | 			sb.append(c);
 83 | 			}else{
 84 | 				byte[] b;
 85 | 				try{
 86 | 					b = String.valueOf(c).getBytes("UTF-8");
 87 | 				}catch (Exception e) {
 88 | 					// TODO: handle exception
 89 | 				e.printStackTrace();
 90 | 					b = new byte[0];
 91 | 				}
 92 | 			for(int j = 0; j < b.length; j++){
 93 | 					int k = b[j];
 94 | 					if(k < 0)
 95 | 						k+=256;
 96 | 					sb.append("%"+Integer.toHexString(k).toUpperCase());
 97 | 				}
 98 | 			}
 99 | 	}
100 | 		return sb.toString();
101 | 	}
102 | 	
103 | 	
104 | 	
105 | 	
106 | 	
107 | 	public static void main(String[] args) throws IOException 
108 | 	{
109 | 		String key="大丽花";
110 | 		String s=toGbkString(key);
111 | 		System.out.println(s);
112 | 	
113 | 	        String url="http://baike.baidu.com/item/"+s;
114 | 	        fetch(url);
115 | 	        		
116 | 	}
117 | 	
118 | 	
119 | 	
120 | 
121 | }
122 | 


--------------------------------------------------------------------------------
/spider-问答社区.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/spider-问答社区.zip


--------------------------------------------------------------------------------
/spider-领域网站.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/spider-领域网站.zip


--------------------------------------------------------------------------------
/微信平台接口验证.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/微信平台接口验证.java


--------------------------------------------------------------------------------
/改进后的基于web领域术语抽取:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/改进后的基于web领域术语抽取


--------------------------------------------------------------------------------
/通用方法-提取网页正文.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/通用方法-提取网页正文.zip


--------------------------------------------------------------------------------
/通用方法-正文预处理.java:
--------------------------------------------------------------------------------
  1 | package segment;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.BufferedWriter;
  5 | import java.io.File;
  6 | import java.io.FileInputStream;
  7 | import java.io.FileOutputStream;
  8 | import java.io.IOException;
  9 | import java.io.InputStreamReader;
 10 | import java.io.OutputStreamWriter;
 11 | import java.io.UnsupportedEncodingException;
 12 | import java.nio.charset.Charset;
 13 | import java.nio.file.Files;
 14 | import java.nio.file.Paths;
 15 | import java.util.ArrayList;
 16 | import java.util.List;
 17 | 
 18 | public class WordSeg {
 19 | 
 20 | 	private static final List<String> DIC = new ArrayList<>();
 21 |     private static int MAX_LENGTH;
 22 |     static{
 23 |         try {
 24 |             System.out.println("开始初始化词典");
 25 |             int max=1;
 26 |             int count=0;
 27 |             List<String> lines = Files.readAllLines(Paths.get("D:/dic.txt"), Charset.forName("utf-8"));
 28 |             for(String line : lines){
 29 |                 DIC.add(line);
 30 |                 count++;
 31 |                 if(line.length()>max){
 32 |                     max=line.length();
 33 |                 }
 34 |             }
 35 |             MAX_LENGTH = max;
 36 |             System.out.println("完成初始化词典，词数目："+count);
 37 |             System.out.println("最大分词长度："+MAX_LENGTH);
 38 |         } catch (IOException ex) {
 39 |             System.err.println("词典装载失败:"+ex.getMessage());
 40 |         }
 41 |          
 42 |     }
 43 |     
 44 |     
 45 |     
 46 |     
 47 |     
 48 |     
 49 |     
 50 |     
 51 |     public static void saveFile_zhuijia(String fileName,String content) throws IOException
 52 | 	{
 53 | 	    File file =new File(fileName);
 54 | 	    if(!file.exists())
 55 | 	    {       
 56 | 	       // System.out.println("娑�锟斤拷锟斤拷);
 57 | 	        System.out.println(file.createNewFile());
 58 | 	        file.createNewFile();
 59 | 	    }
 60 | 
 61 | 	    
 62 | 	        FileOutputStream fos = new FileOutputStream(file,true);
 63 | 	        OutputStreamWriter out =new OutputStreamWriter(fos,"utf8");//(fos,"gb2312");
 64 | 	        BufferedWriter bw = new BufferedWriter(out);
 65 | 	        bw.write(content);	       	        
 66 | 	        bw.write("\r\n");
 67 | 	        
 68 | 	        
 69 | 	       
 70 | 	       
 71 | 	        bw.flush();
 72 | 	      //  System.out.println("锟斤拷缂�锟斤拷锟�");
 73 | 
 74 | 	}
 75 |     
 76 | 
 77 |   	 public static ArrayList<File> getListFiles(Object obj) {  
 78 |   	        File directory = null;  
 79 |   	        if (obj instanceof File) {  
 80 |   	            directory = (File) obj;  
 81 |   	        } else {  
 82 |   	            directory = new File(obj.toString());  
 83 |   	        }  
 84 |   	        ArrayList<File> files = new ArrayList<File>();  
 85 |   	        if (directory.isFile()) {  
 86 |   	            files.add(directory);  
 87 |   	            return files;  
 88 |   	        } else if (directory.isDirectory()) {  
 89 |   	            File[] fileArr = directory.listFiles();  
 90 |   	            for (int i = 0; i < fileArr.length; i++) {  
 91 |   	                File fileOne = fileArr[i];  
 92 |   	                files.addAll(getListFiles(fileOne));  
 93 |   	            }  
 94 |   	        }  
 95 |   	        return files;  
 96 |   	    }  
 97 |   	 
 98 |     
 99 |     
100 |     
101 |     
102 |     
103 |     
104 |     
105 |     
106 |     
107 |     public static void main(String[] args) throws IOException{
108 |         String text = "杨尚川是APDPlat应用级产品开发平台的作者";  
109 |         
110 |         //遍历所有所有的文件 
111 |         //每个文件，读一行切分一行。并写入文件中
112 |         
113 |         
114 |         
115 |         WordSeg w=new WordSeg();
116 |         
117 |         
118 |         
119 |        // ArrayList<File> files=getListFiles("D:\\TFIDF");  
120 |         ArrayList<File> files=getListFiles("D:\\手机-正文");  
121 |         System.out.println(files);
122 | 	//UseDemo t=new UseDemo();
123 |         
124 |     for(int i=0;i<files.size()-1;i++){
125 |        
126 |         String s2=files.get(i).toString();//"D:\\缁憋拷锟�\\httpwwwchinavegancomdaleiDWJFDWYQ.html";
127 |            System.out.println(s2);
128 |            
129 | 		//t.solve(s2,i);
130 |           String temp= readTxtFile(s2);
131 |           
132 |         //  String c= new String(temp.getBytes("ISO-8859-1"), "utf8");
133 |           System.out.println("待切分文本："+temp);
134 |           System.out.println(temp);
135 |           
136 |           //������棰� 
137 |           
138 |          // String sbyte=getEncoding(temp);
139 |         //  String a = new String(temp.getBytes(sbyte),"utf8");
140 |        //   System.out.println(TextExtract.parse(temp)+"\n\n\n\n\n\n");
141 |         //  System.out.println("##############################");
142 |           
143 |           
144 |           //���ユ��浠�
145 |           //saveFile_zhuijia("D:\\TFIDF-分词后\\"+i+".txt",seg(temp));
146 |           
147 |         //  w.saveFile_zhuijia("D:\\TFIDF-分词后\\"+i+".txt",temp);
148 |           w.saveFile_zhuijia("D:\\2\\"+i+".txt",temp);
149 |     }
150 | 
151 |         
152 |         
153 |         
154 |         
155 |         
156 |         
157 |       
158 |     }
159 |     
160 |     
161 |     
162 |     
163 |     
164 |     
165 |     
166 |  	 public static String readTxtFile(String filePath){
167 | 	   		
168 |          try {
169 |         	 StringBuilder build=new StringBuilder();
170 |                  String encoding="utf-8";//"GB2312";
171 |                  File file=new File(filePath);
172 |                  
173 |                  if(file.isFile() && file.exists()){ //�ゆ����浠舵����瀛���
174 |                      InputStreamReader read = new InputStreamReader(
175 |                      new FileInputStream(file),encoding);//�����扮����煎�
176 |                      BufferedReader bufferedReader = new BufferedReader(read);
177 |                      String lineTxt = null;
178 |                      while((lineTxt = bufferedReader.readLine()) != null){
179 |                         // System.out.println(lineTxt);
180 |                     	 lineTxt=lineTxt+"\n";
181 |                        //  build.append(lineTxt);
182 |                         // System.out.println(seg(lineTxt));
183 |                          for(String temp:seg(lineTxt))
184 |                          {
185 |                         	 build.append(temp+" ");
186 |                         	 
187 |                          }
188 |                      }
189 |                      
190 |                      read.close();
191 |                      
192 |                     
193 |                      return   build.toString();
194 |                      
195 |                     
196 |                      
197 |                      
198 |          }else{
199 |              System.out.println("�句��版��瀹�����浠�");
200 |          }
201 |          } catch (Exception e) {
202 |              System.out.println("璇诲����浠跺��瀹瑰�洪��");
203 |              e.printStackTrace();
204 |          }
205 | 		
206 |       return null;
207 |      }
208 |    	 
209 |    	 
210 |    	 
211 |     
212 |     
213 |     
214 |     
215 |     
216 |     
217 |     
218 |     
219 |     public static List<String> seg(String text){        
220 |         List<String> result = new ArrayList<>();
221 |         while(text.length()>0){
222 |             int len=MAX_LENGTH;
223 |             if(text.length()<len){
224 |                 len=text.length();
225 |             }
226 |             //取指定的最大长度的文本去词典里面匹配
227 |             String tryWord = text.substring(0, 0+len);
228 |             while(!DIC.contains(tryWord)){
229 |                 //如果长度为一且在词典中未找到匹配，则按长度为一切分
230 |                 if(tryWord.length()==1){
231 |                     break;
232 |                 }
233 |                 //如果匹配不到，则长度减一继续匹配
234 |                 tryWord=tryWord.substring(0, tryWord.length()-1);
235 |             }
236 |             result.add(tryWord);
237 |             //从待分词文本中去除已经分词的文本
238 |             text=text.substring(tryWord.length());
239 |         }
240 |         return result;
241 |     }
242 | }
243 | 


--------------------------------------------------------------------------------
/问句匹配方法.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/问句匹配方法.zip


--------------------------------------------------------------------------------
/问句检索方法.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/问句检索方法.zip


--------------------------------------------------------------------------------
/项目-jar包.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/steady-pace/Domain_QA/e7b186536ff114b95fbe58efda38cecfd2285a28/项目-jar包.zip


--------------------------------------------------------------------------------