├── .gitignore ├── README.md └── ZhihuDown ├── .classpath ├── .gitignore ├── .project ├── .settings └── org.eclipse.jdt.core.prefs └── src ├── controller └── Spider.java └── model ├── FileReaderWriter.java └── Zhihu.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | 3 | # Package Files # 4 | *.jar 5 | *.war 6 | *.ear 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 知乎下巴 2 | ========== 3 | 4 | 笔记地址: 5 | [让我们一起来做一个知乎下巴噢耶](http://blog.csdn.net/pleasecallmewhy/article/details/17538809) -------------------------------------------------------------------------------- /ZhihuDown/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /ZhihuDown/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /ZhihuDown/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | ZhihuDown 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /ZhihuDown/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /ZhihuDown/src/controller/Spider.java: -------------------------------------------------------------------------------- 1 | package controller; 2 | 3 | import java.util.ArrayList; 4 | 5 | import org.apache.http.client.methods.CloseableHttpResponse; 6 | import org.apache.http.client.methods.HttpGet; 7 | import org.apache.http.impl.client.CloseableHttpClient; 8 | import org.apache.http.impl.client.HttpClients; 9 | import org.apache.http.util.EntityUtils; 10 | import org.jsoup.Jsoup; 11 | import org.jsoup.nodes.Document; 12 | import org.jsoup.nodes.Element; 13 | import org.jsoup.select.Elements; 14 | 15 | import model.FileReaderWriter; 16 | import model.Zhihu; 17 | 18 | public class Spider { 19 | public static void main(String[] args) { 20 | 21 | String url = "http://www.zhihu.com/explore/recommendations"; 22 | // 获取知乎推荐首页 23 | String content = Spider.SendGet(url); 24 | 25 | // 获取推荐内容详细内容 26 | ArrayList myZhihu = Spider.GetRecommendations(content); 27 | 28 | // 写入文档 29 | for (Zhihu zhihu : myZhihu) { 30 | FileReaderWriter.writeIntoFile(zhihu.writeString(), 31 | "D:/知乎_编辑推荐.txt", true); 32 | } 33 | } 34 | 35 | //获取指定Url页面内容 36 | //采用http-client和http-core 4.3 jar包 37 | public static String SendGet(String url) { 38 | 39 | CloseableHttpClient client = HttpClients.createDefault(); 40 | try{ 41 | HttpGet request = new HttpGet(url); 42 | CloseableHttpResponse resp = client.execute(request); 43 | 44 | String result = EntityUtils.toString(resp.getEntity()); 45 | 46 | return result; 47 | }catch(Exception e){ 48 | e.printStackTrace(); 49 | }finally{ 50 | try{ 51 | client.close(); 52 | }catch(Exception e){ 53 | e.printStackTrace(); 54 | } 55 | 56 | } 57 | 58 | return null; 59 | } 60 | 61 | // 获取推荐内容详细内容url 62 | public static ArrayList GetRecommendations(String content) { 63 | 64 | ArrayList results = new ArrayList(); 65 | Document doc = Jsoup.parse(content); 66 | Elements items = doc.getElementsByClass("zm-item"); //推荐内容元素 67 | for(Element item:items){ 68 | Element h2TagEle = item.getElementsByTag("h2").first(); //推荐内容标题元素 69 | Element aTagEl = h2TagEle.getElementsByTag("a").first(); //推荐内容的Url超链接元素 70 | String href = aTagEl.attr("href"); //推荐内容url 71 | if(href.contains("question")){ //去除不规范url 72 | results.add(new Zhihu(href)); 73 | } 74 | } 75 | return results; 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /ZhihuDown/src/model/FileReaderWriter.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/callmewhy/ZhihuDown/4b8fee29cbffb45c8febe90fd74caa3365ec77fe/ZhihuDown/src/model/FileReaderWriter.java -------------------------------------------------------------------------------- /ZhihuDown/src/model/Zhihu.java: -------------------------------------------------------------------------------- 1 | package model; 2 | 3 | import java.util.ArrayList; 4 | import java.util.regex.Matcher; 5 | import java.util.regex.Pattern; 6 | 7 | import org.jsoup.Jsoup; 8 | import org.jsoup.nodes.Document; 9 | import org.jsoup.nodes.Element; 10 | import org.jsoup.select.Elements; 11 | 12 | import controller.Spider; 13 | 14 | public class Zhihu { 15 | public String question;// 问题 16 | public String questionDescription;// 问题描述 17 | public String zhihuUrl;// 链接地址 18 | public ArrayList answers;// 回答 19 | 20 | // 获取知乎内容 21 | public Zhihu(String url) { 22 | question = ""; 23 | questionDescription = ""; 24 | zhihuUrl = ""; 25 | answers = new ArrayList(); 26 | 27 | if (getRealUrl(url)) { 28 | 29 | // 获取推荐内容详细页面 30 | String content = Spider.SendGet(zhihuUrl); 31 | if(content != null){ 32 | Document doc = Jsoup.parse(content); 33 | // 获取标题,即用户发布的问题 34 | question = doc.title(); 35 | 36 | // 问题消息标书 37 | Element despElement = doc.getElementById("zh-question-detail"); 38 | if(despElement != null){ 39 | questionDescription = despElement.text(); 40 | } 41 | // 解答 42 | Elements ansItems = doc.getElementsByClass("zm-item-answer"); 43 | for(Element ansItem:ansItems){ 44 | Element textElement = ansItem.getElementsByClass("zm-item-rich-text").first(); 45 | if(despElement != null){ 46 | answers.add(textElement.text()); 47 | } 48 | } 49 | }else{ 50 | System.out.println("content is null"); 51 | } 52 | } 53 | } 54 | 55 | // ����url 56 | boolean getRealUrl(String url) { 57 | // 将http://www.zhihu.com/question/22355264/answer/21102139 58 | // 转换为http://www.zhihu.com/question/22355264 59 | Pattern pattern = Pattern.compile("question/(.*?)/"); 60 | Matcher matcher = pattern.matcher(url); 61 | if (matcher.find()) { 62 | zhihuUrl = "http://www.zhihu.com/question/" + matcher.group(1); 63 | } else { 64 | return false; 65 | } 66 | return true; 67 | } 68 | 69 | public String writeString() { 70 | // 将html页面转换为字符串 71 | String result = ""; 72 | result += "问题:" + question + "\r\n"; 73 | result += "描述:" + questionDescription + "\r\n"; 74 | result += "链接:" + zhihuUrl + "\r\n\r\n"; 75 | for (int i = 0; i < answers.size(); i++) { 76 | result += "回答" + i + ":" + answers.get(i) + "\r\n\r\n\r\n"; 77 | } 78 | result += "\r\n\r\n\r\n\r\n\r\n\r\n"; 79 | // 替换html换行符和其他字符 80 | result = result.replaceAll("
", "\r\n"); 81 | result = result.replaceAll("<.*?>", ""); 82 | return result; 83 | } 84 | 85 | @Override 86 | public String toString() { 87 | String result = ""; 88 | result += "问题:" + question + "\n"; 89 | result += "描述:" + questionDescription + "\n"; 90 | result += "链接:" + zhihuUrl + "\n"; 91 | result += "回答:" + answers.size() + "\n"; 92 | return result; 93 | } 94 | } 95 | --------------------------------------------------------------------------------