├── .gitignore ├── src ├── config.js ├── crawl.js ├── index.js ├── htmlParse.js └── db.js ├── README.md └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules -------------------------------------------------------------------------------- /src/config.js: -------------------------------------------------------------------------------- 1 | 2 | const DB_CONNECT_STR = 'mongodb://localhost:27017/house_leasing'; 3 | const REQ_OPTIONS = { 4 | uri: 'https://www.douban.com/group/shanghaizufang/', 5 | headers: { 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36' 7 | } 8 | } 9 | 10 | 11 | exports.DB_CONNECT_STR = DB_CONNECT_STR; 12 | exports.REQ_OPTIONS = REQ_OPTIONS; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | doubanzufang 2 | 3 | 4 | What doubanzufang: 5 | -------- 功能 --------- 6 | 1. 利用node 爬取豆瓣上海租房小组信息(douban.com/group/shanghaizufang/) 7 | 8 | ---------------------- 9 | 10 | -------- 目的 -------- 11 | 1. 学习 node 12 | --------------------- 13 | 14 | 15 | Installation and dependencies: 16 | 1. npm install 17 | 18 | 19 | 20 | 21 | License: 22 | doubanzufang may be freely distributed under the MIT license. -------------------------------------------------------------------------------- /src/crawl.js: -------------------------------------------------------------------------------- 1 | const request = require('request-promise'); 2 | const reqOption = require('./config').REQ_OPTIONS; 3 | const parse = require('./htmlParse').parse; 4 | const insertData = require('./db').insertHouse; 5 | 6 | const crawlPage = () => { 7 | request(reqOption) 8 | .then(html => { 9 | return parse(html); 10 | }) 11 | .then(houseList => { 12 | insertData(houseList); 13 | }) 14 | .catch(err => {console.error(err)}); 15 | } 16 | 17 | exports.crawlPage = crawlPage; -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | const schedule = require('node-schedule'); 2 | const crawlPage = require('./crawl').crawlPage; 3 | const selectAllHouseId = require('./db').selectAllHouseId; 4 | 5 | // const scheduleRule = new schedule.RecurrenceRule(); 6 | // // 1, 8~24 每小时整点执行一次 7 | // scheduleRule.hour = [1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] 8 | // scheduleRule.minute = 0; 9 | // schedule.scheduleJob(scheduleRule, ()=> { 10 | // console.log('定时任务:' + new Date()) 11 | // }) 12 | 13 | 14 | selectAllHouseId().then((ids) => { 15 | const arr = ids.map((idObj, index) => { 16 | return idObj.id; 17 | }) 18 | console.log(arr); 19 | }, (err) => { 20 | console.error(err); 21 | }) 22 | // crawlPage(); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "doubanzufang", 3 | "version": "1.0.0", 4 | "description": "collection house-leasing data of douban", 5 | "main": "request.js", 6 | "dependencies": { 7 | "cheerio": "^1.0.0-rc.2", 8 | "mongodb": "^3.0.6", 9 | "node-schedule": "^1.3.0", 10 | "request-promise": "^4.2.2" 11 | }, 12 | "devDependencies": {}, 13 | "scripts": { 14 | "test": "echo \"Error: no test specified\" && exit 1" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/codeDebugTest/doubanzufang.git" 19 | }, 20 | "keywords": [ 21 | "node" 22 | ], 23 | "author": "orlando liu", 24 | "license": "MIT", 25 | "bugs": { 26 | "url": "https://github.com/codeDebugTest/doubanzufang/issues" 27 | }, 28 | "homepage": "https://github.com/codeDebugTest/doubanzufang#readme" 29 | } 30 | -------------------------------------------------------------------------------- /src/htmlParse.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | 3 | const CurrentYear = (new Date()).getFullYear(); 4 | 5 | const getID = (url) => { 6 | // e.g: https://www.douban.com/people/159176019/ 7 | const splited = url.split('/'); 8 | return splited[splited.length -2]; 9 | } 10 | 11 | const getTitleInfo = ($, dom) => { 12 | const url = $(dom).find('a').attr('href'); 13 | 14 | return { 15 | title: $(dom).find('a').attr('title'), 16 | url: url, 17 | id: getID(url) 18 | } 19 | } 20 | 21 | const getPublisher = ($, dom) => { 22 | const url = $(dom).find('a').attr('href') 23 | 24 | return { 25 | name: $(dom).find('a').text(), 26 | id: getID(url), 27 | } 28 | } 29 | 30 | const exchangToJsObject = ($, dom) => { 31 | const topic = {}; 32 | $(dom).find('td').map(function(i, e) { 33 | if (i == 0) { 34 | const titleInfo = getTitleInfo($, this); 35 | Object.assign(topic, titleInfo); 36 | } else if (i == 1) { 37 | topic['publisher'] = getPublisher($, this); 38 | } else if (i == 2) { 39 | topic['responseAmount'] = $(this).html(); 40 | } else if (i == 3) { 41 | topic['lastResponseTime'] = CurrentYear + '-' + $(this).html(); 42 | } 43 | }); 44 | return topic; 45 | } 46 | 47 | const parse = (html) => { 48 | $ = cheerio.load(html); 49 | const houseArr = []; 50 | 51 | $('table.olt').children().find('tr').map(function(index, el) { 52 | //去除表头 53 | if (index == 0) { 54 | return true; 55 | } 56 | 57 | houseArr.push(exchangToJsObject($, this)); 58 | }) 59 | return houseArr; 60 | }; 61 | 62 | exports.parse = parse; -------------------------------------------------------------------------------- /src/db.js: -------------------------------------------------------------------------------- 1 | const MongoClient = require('mongodb').MongoClient; 2 | const assert = require('assert'); 3 | 4 | const DB_URL = require('./config').DB_CONNECT_STR; 5 | const DB_NAME = 'house_leasing'; 6 | 7 | const _throwDBError = (action='操作数据库', err) => { 8 | console.log('------- '+ action + '失败 --------'); 9 | throw 'DB ERROR: ' + err; 10 | }; 11 | 12 | const _dbCallbackFactory = (action, callback) => { 13 | return (err, result) => { 14 | assert.equal(err, null); 15 | console.log(action + ' successful.'); 16 | callback && callback(result); 17 | } 18 | } 19 | 20 | const insertHouse = (data) => { 21 | MongoClient.connect(DB_URL, (err, client) => { 22 | assert.equal(null, err); 23 | console.log('------- connected db successfully. --------'); 24 | 25 | const db = client.db(DB_NAME); 26 | const collection = db.collection("house_topic"); 27 | collection.insertMany(data, (err, result) => { 28 | assert.equal(null, err); 29 | assert.equal(data.length, result.result.n); 30 | assert.equal(data.length, result.ops.length); 31 | console.log("-------- insert data successfully. ----------"); 32 | client.close(); 33 | }); 34 | }) 35 | } 36 | 37 | const selectAllHouse = () => { 38 | MongoClient.connect(DB_URL, (err,client) => { 39 | assert.equal(null, err); 40 | console.log('------- connected db successfully. --------'); 41 | const db = client.db(DB_NAME); 42 | const collection = db.collection("house_topic"); 43 | collection.find({}).toArray((err, houseList) => { 44 | assert(err, null); 45 | console.log('------- found following records --------'); 46 | console.log(houseLIst); 47 | }) 48 | }) 49 | } 50 | 51 | const selectAllHouseId = async () => { 52 | try { 53 | const client = await MongoClient.connect(DB_URL); 54 | console.log("Connected correctly to server"); 55 | 56 | const collection = client.db(DB_NAME).collection("house_topic"); 57 | let result = collection.find({}).project({id:1}).toArray(); 58 | 59 | client.close(); 60 | return result; 61 | } catch (err) { 62 | throw new Error(err); 63 | }; 64 | } 65 | 66 | 67 | exports.insertHouse = insertHouse; 68 | exports.selectAllHouseId = selectAllHouseId; --------------------------------------------------------------------------------