├── package.json ├── app.js └── README.md /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "micro-scraper", 3 | "version": "0.0.1", 4 | "private": true, 5 | "scripts": { 6 | "start": "node app.js" 7 | }, 8 | "dependencies": { 9 | "request": "*", 10 | "cheerio": "*" 11 | } 12 | } -------------------------------------------------------------------------------- /app.js: -------------------------------------------------------------------------------- 1 | var request = require('request'), 2 | cheerio = require('cheerio'), 3 | http = require('http'), 4 | url = require('url'); 5 | 6 | var host = 'http://baike.baidu.com/view/39744.htm'; 7 | 8 | var html = []; 9 | setInterval(scraper(host), 1000*60*15);//15 分钟更新一次 10 | function scraper (host) { 11 | request(host, function (error, response, data) { 12 | if (!error && response.statusCode == 200) { 13 | var $ = cheerio.load(data); 14 | var title = $('.title').first().text(), 15 | header = [], 16 | nav = [], 17 | body = []; 18 | //删除无用数据 19 | $('.title').remove(); 20 | $('.pic-info').remove(); 21 | $('.count').remove(); 22 | $('sup').remove(); 23 | //筛选有用数据 24 | $('#lemmaContent-0 .headline-1').each(function (i) { 25 | var str = '', 26 | $next = $(this).next(); 27 | while (!$next.hasClass('headline-1')&&(!$next.next().hasClass('clear'))) { 28 | if ($next.hasClass('headline-2')) { 29 | str += "

" + $next.text() + "

"; 30 | } else { 31 | str += "

" + $next.text() + "

"; 32 | } 33 | $next = $next.next(); 34 | } 35 | header.push($(this).find('.headline-content').text()); 36 | nav.push("" + header[i] + ""); 37 | body.push(str); 38 | }); 39 | 40 | var len = $('#catalog-holder-0 .catalog-item').length;//获取 “目录” 条文数 41 | for (var i = 0; i < len; i++) { 42 | html[i] = "" + 43 | "" + 44 | "" + 45 | "" + 46 | "" + 47 | "" + title + "" + 48 | "" + 56 | "" + 57 | "" + 58 | "
" + header[i] + "
" + 59 | "" + 60 | "
" + body[i] + "
" + 61 | "" + 62 | ""; 63 | } 64 | } 65 | }); 66 | } 67 | 68 | http.createServer(function (req, res) { 69 | var path = url.parse(req.url).pathname; 70 | path = path == '/' ? 0 : parseInt(path.slice(1)); 71 | res.writeHead(200, {"Content-Type":"text/html"}); 72 | res.end(html[path]); 73 | }).listen(3000); 74 | 75 | console.log('Server running at localhost:3000'); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## micro-scraper ## 2 | 3 | Node.js 爬虫示例 (for:百度百科) 4 | 5 | ### app.js ### 6 | 7 | var request = require('request'), 8 | cheerio = require('cheerio'), 9 | http = require('http'), 10 | url = require('url'); 11 | 12 | var host = 'http://baike.baidu.com/view/39744.htm';//可修改为其他的百科地址 13 | 14 | var html = []; 15 | setInterval(scraper(host), 1000*60*15);//15 分钟更新一次 16 | function scraper (host) { 17 | request(host, function (error, response, data) { 18 | if (!error && response.statusCode == 200) { 19 | var $ = cheerio.load(data); 20 | var title = $('.title').first().text(), 21 | header = [], 22 | nav = [], 23 | body = []; 24 | //删除无用数据 25 | $('.title').remove(); 26 | $('.pic-info').remove(); 27 | $('.count').remove(); 28 | $('sup').remove(); 29 | //筛选有用数据 30 | $('#lemmaContent-0 .headline-1').each(function (i) { 31 | var str = '', 32 | $next = $(this).next(); 33 | while (!$next.hasClass('headline-1')&&(!$next.next().hasClass('clear'))) { 34 | if ($next.hasClass('headline-2')) { 35 | str += "

" + $next.text() + "

"; 36 | } else { 37 | str += "

" + $next.text() + "

"; 38 | } 39 | $next = $next.next(); 40 | } 41 | header.push($(this).find('.headline-content').text()); 42 | nav.push("" + header[i] + ""); 43 | body.push(str); 44 | }); 45 | 46 | var len = $('#catalog-holder-0 .catalog-item').length;//获取 “目录” 条文数 47 | for (var i = 0; i < len; i++) { 48 | html[i] = "" + 49 | "" + 50 | "" + 51 | "" + 52 | "" + 53 | "" + title + "" + 54 | "" + 62 | "" + 63 | "" + 64 | "
" + header[i] + "
" + 65 | "" + 66 | "
" + body[i] + "
" + 67 | "" + 68 | ""; 69 | } 70 | } 71 | }); 72 | } 73 | 74 | http.createServer(function (req, res) { 75 | var path = url.parse(req.url).pathname; 76 | path = path == '/' ? 0 : parseInt(path.slice(1)); 77 | res.writeHead(200, {"Content-Type":"text/html"}); 78 | res.end(html[path]); 79 | }).listen(3000); 80 | 81 | console.log('Server running at localhost:3000'); --------------------------------------------------------------------------------