├── package.json ├── app.js └── README.md /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "micro-scraper", 3 | "version": "0.0.1", 4 | "private": true, 5 | "scripts": { 6 | "start": "node app.js" 7 | }, 8 | "dependencies": { 9 | "request": "*", 10 | "cheerio": "*" 11 | } 12 | } -------------------------------------------------------------------------------- /app.js: -------------------------------------------------------------------------------- 1 | var request = require('request'), 2 | cheerio = require('cheerio'), 3 | http = require('http'), 4 | url = require('url'); 5 | 6 | var host = 'http://baike.baidu.com/view/39744.htm'; 7 | 8 | var html = []; 9 | setInterval(scraper(host), 1000*60*15);//15 分钟更新一次 10 | function scraper (host) { 11 | request(host, function (error, response, data) { 12 | if (!error && response.statusCode == 200) { 13 | var $ = cheerio.load(data); 14 | var title = $('.title').first().text(), 15 | header = [], 16 | nav = [], 17 | body = []; 18 | //删除无用数据 19 | $('.title').remove(); 20 | $('.pic-info').remove(); 21 | $('.count').remove(); 22 | $('sup').remove(); 23 | //筛选有用数据 24 | $('#lemmaContent-0 .headline-1').each(function (i) { 25 | var str = '', 26 | $next = $(this).next(); 27 | while (!$next.hasClass('headline-1')&&(!$next.next().hasClass('clear'))) { 28 | if ($next.hasClass('headline-2')) { 29 | str += "
" + $next.text() + "
"; 30 | } else { 31 | str += "" + $next.text() + "
"; 32 | } 33 | $next = $next.next(); 34 | } 35 | header.push($(this).find('.headline-content').text()); 36 | nav.push("" + header[i] + ""); 37 | body.push(str); 38 | }); 39 | 40 | var len = $('#catalog-holder-0 .catalog-item').length;//获取 “目录” 条文数 41 | for (var i = 0; i < len; i++) { 42 | html[i] = "" + 43 | "" + 44 | "" + 45 | "" + 46 | "" + 47 | "" + $next.text() + "
"; 36 | } else { 37 | str += "" + $next.text() + "
"; 38 | } 39 | $next = $next.next(); 40 | } 41 | header.push($(this).find('.headline-content').text()); 42 | nav.push("" + header[i] + ""); 43 | body.push(str); 44 | }); 45 | 46 | var len = $('#catalog-holder-0 .catalog-item').length;//获取 “目录” 条文数 47 | for (var i = 0; i < len; i++) { 48 | html[i] = "" + 49 | "" + 50 | "" + 51 | "" + 52 | "" + 53 | "