├── .babelrc ├── README.MD ├── app.js ├── app ├── controllers │ └── rental.server.controller.js ├── models │ └── rental.server.model.js └── routes │ └── rental.server.routes.js ├── bin └── www ├── config ├── config.js ├── env │ └── development.js ├── express.js └── mongoose.js ├── package.json └── views ├── css └── index.css ├── image ├── 1.png ├── 2.png ├── 3.png ├── 4.png └── house.png ├── index.html └── js ├── index.js └── jquery-3.0.0.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "es2015" 4 | ], 5 | "plugins": [] 6 | } -------------------------------------------------------------------------------- /README.MD: -------------------------------------------------------------------------------- 1 | # Rental app based on Node.js 2 | 3 | ## 运行 How to run 4 | 5 | ```bash 6 | cd Rental 7 | 8 | # install dependencies 9 | npm install 10 | 11 | # app running 12 | npm start 13 | ``` 14 | 15 | 16 | Then open http://localhost:2000 in a browser. 17 | 18 | 19 | ## 简介 Introduction 20 | 21 | 基于Node.js的爬虫系统。 22 | 23 | 爬取58同城上的杭州租房信息,并用百度地图进行可视化显示,方便用户快速查找自己附近的租房信息。 24 | 25 | A crawler system based on Node.js. 26 | 27 | The system crawls Hangzhou rental listings from 58.com (58 Tongcheng) and displays them on a Baidu map, so that users can quickly find rentals near them. 28 | 29 | 30 | 31 |  32 | 33 | 34 |  35 | 36 | 37 |  38 | 39 | 40 |  41 | 42 | 43 | ## 目录结构 Directory Structure 44 |
45 | ├─ package.json # 项目配置 project configuration 46 | ├─ README.md # 项目说明 project instruction 47 | ├─ app.js # 项目入口文件 Project entry file 48 | ├─ node_modules # npm依赖包 npm dependent 49 | ├─ .babelrc # babel配置 babel configuration 50 | │ 51 | │ 52 | ├─ app # node后端业务 node business 53 | │ 54 | │ controllers # 控制器 controller 55 | │ models # 数据模型 data model 56 | │ routes # 路由 route 57 | │ 58 | │ 59 | ├─ bin # node启动 node start 60 | │ 61 | │ 62 | └─ views # 前端代码 front end code 63 | │ css # css文件 css 64 | │ js # js文件 js 65 | │ image # 图片文件 image 66 | │ index.html # 首页 index 67 | │ 68 | │ 69 | └─ config # node配置 node configuration 70 |71 | 72 | 73 | ## 教程 Tutorial 74 | 75 | [http://answershuto.github.io/jekyll/update/2016/10/25/养只爬虫当宠物-Node.js爬虫爬取58同城租房信息.html](http://answershuto.github.io/jekyll/update/2016/10/25/养只爬虫当宠物-Node.js爬虫爬取58同城租房信息.html) 76 | -------------------------------------------------------------------------------- /app.js: -------------------------------------------------------------------------------- 1 | var express = require('./config/express'); 2 | 3 | var app = express(); 4 | 5 | 6 | module.exports = app; -------------------------------------------------------------------------------- /app/controllers/rental.server.controller.js: -------------------------------------------------------------------------------- 1 | var http = require('http'); 2 | var cheerio = require('cheerio'); 3 | var cfg = require('../../config/config') 4 | 5 | /*url存储对象*/ 6 | let rentalObj = (function(){ 7 | /*保存58同城上爬取的每个租房的URL*/ 8 | let rentalSet = new Set(); 9 | 10 | /*增加URL后的回调函数*/ 11 | let callBackFunc = function(){}; 12 | 13 | return { 14 | add(data){ 15 | if (data.indexOf('hz.58.com') < 0) return;/*暂时屏蔽会跳转的URL*/ 16 | rentalSet.add(data); 17 | callBackFunc && callBackFunc(data); 18 | }, 19 | 20 | register(func){ 21 | callBackFunc = func; 22 | }, 23 | 24 | unRegister(){ 25 | callBackFunc = function(){}; 26 | } 27 | } 28 | })(); 29 | 30 | /*url读取解析对象*/ 31 | let rentalInfosObj 
= (function(){ 32 | /*二手房解析出来的数据存储在该map中*/ 33 | let rentalInfosMap = new Map(); 34 | 35 | let szUrlPipe = [];/*管道数组,将得到url压入,由定时器按时读取访问解析。*/ 36 | 37 | let iNum = 0; 38 | (function func(){ 39 | iNum++; 40 | 41 | /*反爬虫策略:1-10s随机访问,每8次休息一次,休息时间为3-9分钟随机*/ 42 | if (iNum > 8) { 43 | setTimeout(func,10000 * (1+Math.random())); 44 | if (iNum === 40) { 45 | iNum = 0; 46 | }; 47 | } 48 | else{ 49 | if (szUrlPipe.length) { 50 | analysis(szUrlPipe.shift()); 51 | }; 52 | setTimeout(func,10000 * Math.random()); 53 | } 54 | })(); 55 | 56 | 57 | /*根据url访问并解析返回值*/ 58 | function analysis(url){ 59 | let html = ''; 60 | http.get(url, function(res){ 61 | res.on('data', function(chuck){ 62 | html += chuck; 63 | }) 64 | 65 | res.on('end', function(){ 66 | let $ = cheerio.load(html); 67 | try{ 68 | 69 | $('a.c_333') && $('a.c_333')['0'] 70 | && rentalInfosMap.set(url, { 71 | tel: $('span.tel-num.tel-font').text(), 72 | price: $('.house-price').text(), 73 | location: $('a.c_333')[0].children[0].data, 74 | img: $('#smainPic')['0'].attribs.src, 75 | }) 76 | 77 | console.log('get '+Array.from(rentalInfosMap).length+' rental infos'); 78 | } 79 | catch(e){ 80 | console.log('get rental infos or rentalInfosMap set error!'); 81 | } 82 | }) 83 | }) 84 | } 85 | 86 | return { 87 | push(url){ 88 | szUrlPipe.push(url); 89 | }, 90 | getRentalInfos(){ 91 | let params = {}; 92 | 93 | for(let [k,v] of rentalInfosMap){ 94 | params[k] = v; 95 | } 96 | 97 | return params; 98 | } 99 | } 100 | })(); 101 | 102 | /*********************************************************************************************** 103 | *函数名 getUrl 104 | *函数功能描述 :根据第几页获取url 105 | *函数参数 :page:页码 106 | *函数返回值 :url 107 | ***********************************************************************************************/ 108 | function getUrl(page = 1){ 109 | return 
'http://hz.58.com/chuzu/pn'+page+'/?key=%E6%9D%AD%E5%B7%9E%E7%A7%9F%E6%88%BF%E5%AD%90&cmcskey=%E7%A7%9F%E6%88%BF%E5%AD%90&final=1&PGTID=0d3090a7-0004-f43c-ee04-95c2ea3d031f&ClickID=6'; 110 | } 111 | 112 | /*********************************************************************************************** 113 | *函数名 updateRentalUrl 114 | *函数功能描述 :从58网站上更新租房信息 115 | *函数参数 :无 116 | *函数返回值 :无 117 | ***********************************************************************************************/ 118 | function updateRentalUrl(){ 119 | for(let page=1;page<=cfg.page;page++){ 120 | let html = ''; 121 | 122 | http.get(getUrl(page), function(res){ 123 | res.on('data', function(chuck){ 124 | html += chuck; 125 | }) 126 | 127 | res.on('end', function(){ 128 | let $ = cheerio.load(html); 129 | let arrRentals = $('.tbimg')[0]; 130 | for(let i = 0; i < $('div.des > h2 > a').length; i++){ 131 | rentalObj.add($('div.des > h2 > a')[i].attribs.href) 132 | } 133 | }) 134 | }) 135 | } 136 | } 137 | 138 | /*********************************************************************************************** 139 | *函数名 getRentalInfosByUrl 140 | *函数功能描述 :根据URL获取租房信息 141 | *函数参数 :url:每条租房信息的URL 142 | *函数返回值 :无 143 | ***********************************************************************************************/ 144 | function getRentalInfosByUrl(url){ 145 | rentalInfosObj.push(url); 146 | } 147 | 148 | 149 | 150 | module.exports = { 151 | init(){ 152 | updateRentalUrl(); 153 | rentalObj.register(getRentalInfosByUrl); 154 | }, 155 | 156 | getRentalInfos(req, res, next){ 157 | let params = rentalInfosObj.getRentalInfos(); 158 | console.log('params',params) 159 | res.json({result: true,params}); 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /app/models/rental.server.model.js: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- 
/app/routes/rental.server.routes.js: -------------------------------------------------------------------------------- 1 | var RentalController = require('../controllers/rental.server.controller'); 2 | 3 | module.exports = function(app){ 4 | app.route('/') 5 | .get(function(req,res,next){ 6 | res.sendFile('index.html'); 7 | }); 8 | 9 | app.route('/rental/getInfos') 10 | .all(RentalController.getRentalInfos); 11 | } -------------------------------------------------------------------------------- /bin/www: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | var app = require('../app') 4 | var config = require('../config/config'); 5 | 6 | app.listen(config.port, function(){ 7 | console.log('app started,listen on port:',config.port); 8 | }) -------------------------------------------------------------------------------- /config/config.js: -------------------------------------------------------------------------------- 1 | var config = null; 2 | 3 | if(process && process.env && process.env.NODE_ENV){ 4 | config = require('./env/' + process.env.NODE_ENV); 5 | } 6 | else{ 7 | /*开发环境*/ 8 | config = require('./env/development.js'); 9 | } 10 | 11 | module.exports = config; -------------------------------------------------------------------------------- /config/env/development.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | port: 2000, 3 | mongodb: 'mongodb://localhost/scms', 4 | page: 10, /*解析几页的租房数据*/ 5 | } -------------------------------------------------------------------------------- /config/express.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var bodyParser = require('body-parser'); 3 | var controller = require('../app/controllers/rental.server.controller'); 4 | 5 | module.exports = function(){ 6 | console.log('init express..'); 7 | var app = express(); 8 | 9 | 
app.set('view engine','ejs'); 10 | app.use(express.static(__dirname+'/../views')); 11 | 12 | app.use(bodyParser.json()); 13 | 14 | require('../app/routes/rental.server.routes')(app); 15 | controller.init(); 16 | 17 | app.use(function(req, res, next){ 18 | res.status(404); 19 | try{ 20 | return res.json('No Found!'); 21 | } 22 | catch(e){ 23 | console.error('404 set header after send.'); 24 | } 25 | }) 26 | 27 | app.use(function(err, req, res, next){ 28 | if (!err) { 29 | return next(); 30 | }; 31 | 32 | res.status(500); 33 | try{ 34 | return res.json(err.message || "server err"); 35 | } 36 | catch(e){ 37 | console.error('500 set header after send.') 38 | } 39 | }); 40 | 41 | return app; 42 | }; -------------------------------------------------------------------------------- /config/mongoose.js: -------------------------------------------------------------------------------- 1 | var config = require('./config') 2 | 3 | module.exports = function(){ 4 | 5 | require('../app/models/rental.server.model.js'); 6 | 7 | return db; 8 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "rental", 3 | "version": "1.0.0", 4 | "description": "nodejs Project module", 5 | "main": "bin/www", 6 | "scripts": { 7 | "start": "babel-node bin/www", 8 | "test": "echo \"Error: no test specified\" && exit 1" 9 | }, 10 | "keywords": [], 11 | "author": "cao yang", 12 | "license": "ISC", 13 | "dependencies": { 14 | "body-parser": "^1.15.1", 15 | "cheerio": "^0.22.0", 16 | "express": "^4.13.4", 17 | "http": "0.0.0", 18 | "request": "^2.75.0" 19 | }, 20 | "devDependencies": { 21 | "babel-cli": "^6.16.0", 22 | "babel-preset-es2015": "^6.16.0" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /views/css/index.css: -------------------------------------------------------------------------------- 1 | html{ 2 
| height:100% 3 | } 4 | 5 | body{ 6 | height:100%;margin:0px;padding:0px 7 | } 8 | 9 | #container{ 10 | height:100% 11 | } 12 | 13 | .showImg{ 14 | width: 200px; 15 | height: 160px; 16 | margin: 20px auto; 17 | cursor: pointer; 18 | } -------------------------------------------------------------------------------- /views/image/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answershuto/Rental/be282448772bd55d6ddac21debdef46f7d580329/views/image/1.png -------------------------------------------------------------------------------- /views/image/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answershuto/Rental/be282448772bd55d6ddac21debdef46f7d580329/views/image/2.png -------------------------------------------------------------------------------- /views/image/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answershuto/Rental/be282448772bd55d6ddac21debdef46f7d580329/views/image/3.png -------------------------------------------------------------------------------- /views/image/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answershuto/Rental/be282448772bd55d6ddac21debdef46f7d580329/views/image/4.png -------------------------------------------------------------------------------- /views/image/house.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/answershuto/Rental/be282448772bd55d6ddac21debdef46f7d580329/views/image/house.png -------------------------------------------------------------------------------- /views/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |