├── README.md ├── package.json ├── fetch.js ├── setting.js └── spider.js /README.md: -------------------------------------------------------------------------------- 1 | # NodeSpide 2 | 一个爬取知乎问题“眼睛好看是一种什么样的体验?”=>https://www.zhihu.com/question/34937418 下所有回答里的照片的小爬虫~ 3 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "lin", 10 | "license": "ISC", 11 | "dependencies": { 12 | "async": "^1.5.2", 13 | "cheerio": "^0.19.0", 14 | "eventproxy": "^0.3.4", 15 | "superagent": "^1.7.0" 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /fetch.js: -------------------------------------------------------------------------------- 1 | var request = require("superagent"); 2 | 3 | module.exports = { 4 | fetch_data_get : function( url, query_params ){ 5 | return new Promise(( resolve, reject ) => { 6 | request .get(url) 7 | .set( "Accept", "application/json" ) 8 | .query( query_params ) 9 | .end(( error, result ) => { 10 | error ? reject( error ) : resolve( result ); 11 | }); 12 | }); 13 | }, 14 | fetch_data_post : function( url, post_data, header ){ 15 | return new Promise(( resolve, reject ) => { 16 | request .post( url ) 17 | .set( header ) 18 | .send( post_data ) 19 | .end(( error, result ) => { 20 | error ? reject( error ) : resolve( result ); 21 | }); 22 | }); 23 | } 24 | } -------------------------------------------------------------------------------- /setting.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | header : { 3 | "Accept" : "*/*", 4 | "Accept-Encoding" : "gzip, deflate, br", 5 | "Accept-Language" : "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", 6 | "Connection" : "keep-alive", 7 | "Content-Length" : "132", 8 | "Content-Type" : "application/x-www-form-urlencoded; charset=UTF-8", 9 | "Host" : "www.zhihu.com", 10 | "Referer" : "https://www.zhihu.com/question/34937418", 11 | "User-Agent" : "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0", 12 | "X-Requested-With" : "XMLHttpRequest" 13 | }, 14 | firstLink : "https://www.zhihu.com/question/34937418", 15 | ajaxLink : "https://www.zhihu.com/node/QuestionAnswerListV2", 16 | post_data_h : "method=next¶ms=%7B%22url_token%22%3A34937418%2C%22pagesize%22%3A20%2C%22offset%22%3A", 17 | post_data_f : "%7D&_xsrf=98360a2df02783902146dee374772e51", 18 | // 发送ajax间隔时间 19 | ajax_timeout : 5, 20 | // 下载图片速度 21 | download_v : 100 22 | } -------------------------------------------------------------------------------- /spider.js: -------------------------------------------------------------------------------- 1 | let async = require("async"); 2 | let cheerio = require("cheerio"); 3 | let fs = require("fs"); 4 | let path = require("path"); 5 | 6 | let setting = require("./setting.js"); 7 | 8 | let fetch_data_get = require("./fetch.js").fetch_data_get; 9 | let fetch_data_post = require("./fetch.js").fetch_data_post; 10 | 11 | // 存储所有图片链接的数组 12 | let photos=[ ]; 13 | let count = 0; 14 | 15 | // 获取首屏所有图片链接 16 | function getInitUrlList(){ 17 | fetch_data_get( setting.firstLink, { } ) 18 | .then(( result ) => { 19 | let $ = cheerio.load( result.text ); 20 | let answerList = $( ".zm-item-answer" ); 21 | answerList.map(function( i, answer ){ 22 | let images = $( answer ).find( '.zm-item-rich-text img' ); 23 | images.map(function( i, image ){ 24 | photos.push( $(image).attr( "src" ) ); 25 | }); 26 | }); 27 | console.log( "已成功抓取" + photos.length + "张图片的链接" ); 28 | getIAjaxUrlList( 20 ); 29 | }) 30 | .catch(( error ) => console.log( error )); 31 | } 32 | 33 | // 每隔300毫秒模拟发送ajax请求,并获取请求结果中所有的图片链接 34 | function getIAjaxUrlList( offset ){ 35 | fetch_data_post( setting.ajaxLink, setting.post_data_h + offset + setting.post_data_f, setting.header ) 36 | .then(( result ) => { 37 | let response = JSON.parse( result.text ); 38 | if( offset == 100 ) { 39 | // 把所有的数组元素拼接在一起 40 | let $ = cheerio.load( response.msg.join("") ); 41 | let answerList = $( ".zm-item-answer" ); 42 | answerList.map(function( i ,answer ){ 43 | let images = $( answer ).find( '.zm-item-rich-text img' ); 44 | images.map(function( i, image ){ 45 | photos.push( $(image).attr("src") ); 46 | }); 47 | }); 48 | setTimeout(function() { 49 | offset += 20; 50 | console.log( "已成功抓取 " + photos.length + " 张图片的链接" ); 51 | getIAjaxUrlList( offset ); 52 | }, setting.ajax_timeout) 53 | } else { 54 | console.log( "图片链接全部获取完毕,一共有" + photos.length + "条图片链接" ); 55 | return downloadImg( setting.download_v ); 56 | } 57 | }) 58 | .catch(( error ) => console.log( error )); 59 | } 60 | 61 | function downloadImg( asyncNum ){ 62 | // 有一些图片链接地址不完整没有“http:”头部,帮它们拼接完整 63 | for( let i=0; i { 72 | let fileName = path.basename( photo ); 73 | fs.writeFile( "./img/" + fileName, result.body, function( err ){ 74 | if( err ) { 75 | console.log( err ); 76 | } else { 77 | count ++; 78 | console.log( count + " done " ); 79 | callback( null, fileName ); 80 | } 81 | }) 82 | }) 83 | .catch(( error ) => console.log( error )) 84 | },function( err, result ){ 85 | if( err ) { 86 | console.log( err ); 87 | } else { 88 | console.log( " all right ! " ); 89 | console.log( result ); 90 | } 91 | }) 92 | } 93 | 94 | getInitUrlList(); 95 | --------------------------------------------------------------------------------