├── .gitignore ├── Procfile ├── README.md ├── app.js ├── config.js ├── crawler.js ├── lib └── douban.js ├── model.js ├── package.json └── views ├── author.handlebars ├── home.handlebars ├── layouts └── main.handlebars ├── partials └── _post.handlebars └── posts.handlebars /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .DS_Store 3 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | web: NODE_ENV=production node app.js 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 目前程序部署在:https://haixiu.herokuapp.com/ 2 | 3 | 如果要本地搭建,就去看看 `config.js` 里面的配置。 4 | -------------------------------------------------------------------------------- /app.js: -------------------------------------------------------------------------------- 1 | /**! 2 | * haixiu - app.js 3 | * 4 | */ 5 | 6 | 'use strict'; 7 | 8 | /** 9 | * Module dependencies. 10 | */ 11 | const express = require('express'); 12 | const exphbs = require('express-handlebars'); 13 | const mongoose = require('mongoose'); 14 | const crawler = require('./crawler'); 15 | const model = require('./model'); 16 | const Post = model.Post; 17 | const config = require('./config'); 18 | 19 | mongoose.connect(config.mongodb_url); 20 | 21 | let app = express(); 22 | let hbs = exphbs.create({ 23 | defaultLayout: 'main', 24 | helpers: { 25 | gaid: function () { 26 | return config.gaid; 27 | }, 28 | }, 29 | }); 30 | app.engine('handlebars', hbs.engine); 31 | app.set('view engine', 'handlebars'); 32 | 33 | let cities = [ 34 | {key: 'hangzhou', name: '浙江杭州'}, 35 | {key: 'shanghai', name: '上海'}, 36 | {key: 'beijing', name: '北京'}, 37 | {key: 'chengdu', name: '四川成都'}, 38 | {key: 'nanning', name: '广西南宁'}, 39 | {key: 'changsha', name: '湖南长沙'}, 40 | {key: 'changsanjiao', name: '长三角', names: [ 41 | '浙江杭州', '浙江温州', '浙江宁波', '浙江台州', 42 | '浙江嘉兴', '浙江金华', '浙江绍兴', '浙江湖州', 43 | '浙江丽水', '浙江衢州', '浙江舟山', '上海', '江苏南京' 44 | ] 45 | }, 46 | {key: 'guangzhou', name: '广东广州'}, 47 | {key: 'shenzhen', name: '广东深圳'}, 48 | ]; 49 | 50 | function fixImages(imgs) { 51 | imgs = imgs || []; 52 | return imgs.map(function (img) { 53 | if (img && img.startsWith('https://')) { 54 | img = img.replace('https://', 'http://'); 55 | } 56 | img = img.replace('.doubanio.com', '.douban.com'); 57 | return img; 58 | }); 59 | } 60 | 61 | function fixDocs(docs) { 62 | docs = docs || []; 63 | return docs.map(function (doc) { 64 | doc.imgs = fixImages(doc.imgs); 65 | return doc; 66 | }); 67 | } 68 | 69 | app.get('/', function (req, res, next) { 70 | res.render('home', {cities: cities}); 71 | }); 72 | 73 | // 针对各个地域的 route 配置 74 | 75 | app.get('/all', function (req, res, next) { 76 | Post.find().sort({id: -1}).limit(100).exec(function (err, docs) { 77 | if (err) { 78 | return next(err); 79 | } 80 | docs = fixDocs(docs); 81 | res.render('posts', {docs: docs}); 82 | }); 83 | }); 84 | 85 | for (let i = 0; i < cities.length; i++) { 86 | (function (city) { 87 | let names = city.names || [city.name]; 88 | app.get('/city/' + city.key, function (req, res, next) { 89 | Post.find({author_location: {$in: names}}).sort({id: -1}).limit(100).exec(function (err, docs) { 90 | if (err) { 91 | return next(err); 92 | } 93 | docs = fixDocs(docs); 94 | res.render('posts', {docs: docs}); 95 | }); 96 | }); 97 | })(cities[i]); 98 | } 99 | 100 | // END 针对各个地域的 route 配置 101 | 102 | // 某个用户的发帖 103 | app.get('/author/:authorId', function (req, res, next) { 104 | const authorId = req.params.authorId; 105 | Post.find({author_id: authorId}).sort({id: -1}).limit(100).exec(function (err, docs) { 106 | if (err) { 107 | return next(err); 108 | } 109 | let authorName = ''; 110 | if (docs && docs.length) { 111 | // 取最近一条帖子的昵称 112 | authorName = docs[0].author_name; 113 | } 114 | docs = fixDocs(docs); 115 | res.render('author', { 116 | authorId: authorId, 117 | authorName: authorName, 118 | docs: docs, 119 | }); 120 | }); 121 | }); 122 | 123 | // 启动爬虫 124 | crawler.start(); 125 | 126 | let server = app.listen(config.port, function () { 127 | console.log('app is listening ' + server.address().port); 128 | }); 129 | -------------------------------------------------------------------------------- /config.js: -------------------------------------------------------------------------------- 1 | /**! 2 | * haixiu - config.js 3 | * 4 | */ 5 | 6 | 'use strict'; 7 | 8 | /** 9 | * Module dependencies. 10 | */ 11 | let config = { 12 | mongodb_url: process.env.MONGOHQ_URL || 'mongodb://127.0.0.1/haixiu', 13 | port: process.env.PORT || 27017, 14 | apikey: process.env.DB_APIKEY || '', 15 | groupName: 'haixiuzu', 16 | fetchPage: 20, // 抓取最新20页数据 17 | }; 18 | 19 | exports = module.exports = config; 20 | -------------------------------------------------------------------------------- /crawler.js: -------------------------------------------------------------------------------- 1 | /**! 2 | * haixiu - crawler.js 3 | * 4 | */ 5 | 6 | 'use strict'; 7 | 8 | /** 9 | * Module dependencies. 10 | */ 11 | const Douban = require('./lib/douban'); 12 | const config = require('./config'); 13 | const model = require('./model'); 14 | const _ = require('lodash'); 15 | const co = require('co'); 16 | 17 | const DB = new Douban({ 18 | apikey: config.apikey, 19 | }); 20 | 21 | const Post = model.Post; 22 | 23 | function onerror(err) { 24 | console.error(err.stack); 25 | console.log(err); 26 | } 27 | 28 | function* handleTopic(topic) { 29 | topic = topic || {}; 30 | let topicId = topic.id; 31 | let imgs = _.pluck(topic.photos, 'alt'); 32 | 33 | let exists = yield Post.findOne({id: topicId}).exec(); 34 | if (exists) { 35 | imgs = _.union(imgs, exists.imgs); 36 | } 37 | let post = { 38 | id: topicId, 39 | url: `http://www.douban.com/group/topic/${topicId}/`, 40 | title: topic.title, 41 | imgs: imgs, 42 | author_id: topic.authorInfo.id, 43 | author_name: topic.authorInfo.name, 44 | author_url: topic.authorInfo.alt, 45 | author_location: topic.authorInfo.loc_name || '', 46 | update_at: new Date(), 47 | }; 48 | return yield Post.update({id: topicId}, post, {upsert: true}).exec(); 49 | } 50 | 51 | function fetchHaixiuzu() { 52 | co(function* () { 53 | for (let page = 1; page <= config.fetchPage; page++) { 54 | let topics = DB.groupTopic(config.groupName, page); 55 | for (let i = 0; i < topics.length; i++) { 56 | let topic = topics[i]; 57 | topic.authorInfo = DB.user((topic.author || {}).id); 58 | yield handleTopic(topic); 59 | } 60 | } 61 | }).catch(onerror); 62 | } 63 | 64 | exports.start = function () { 65 | fetchHaixiuzu(); 66 | 67 | // 每10分钟运行一次 68 | setInterval(fetchHaixiuzu, 10 * 60 * 1000); 69 | }; 70 | -------------------------------------------------------------------------------- /lib/douban.js: -------------------------------------------------------------------------------- 1 | /**! 2 | * haixiu - douban.js 3 | * 4 | * Authors: 5 | * rockdai 6 | */ 7 | 8 | 'use strict'; 9 | 10 | /** 11 | * Module dependencies. 12 | */ 13 | const querystring = require('querystring'); 14 | const request = require('urllib-sync').request; 15 | 16 | const API_ROOT = 'https://api.douban.com/v2'; 17 | 18 | /** 19 | * Expose `Client` 20 | */ 21 | 22 | module.exports = Client; 23 | 24 | function Client(options) { 25 | if (!(this instanceof Client)) { 26 | return new Client(options); 27 | } 28 | 29 | options = options || {}; 30 | this.apikey = options.apikey; 31 | this.timeout = options.timeout || 30000; 32 | } 33 | 34 | Client.prototype.getUrl = function(path, query) { 35 | let result = API_ROOT + path; 36 | query = query || {}; 37 | if (this.apikey) { 38 | query.apikey = this.apikey; 39 | } 40 | result = result + '?' + querystring.stringify(query); 41 | return result; 42 | }; 43 | 44 | Client.prototype.request = function (url, args) { 45 | 46 | args = args || {}; 47 | args.timeout = this.timeout; 48 | 49 | let result = request(url, args); 50 | 51 | let body = result.data.toString(); 52 | let status = result.status; 53 | let headers = result.headers; 54 | if (status.toString()[0] !== '2') { 55 | let err = new Error('Request Douban API error.'); 56 | err.name = 'RequestDoubanAPIError'; 57 | err.statusCode = status; 58 | err.originHeaders = headers; 59 | err.originBody = body; 60 | throw err; 61 | } 62 | let jsonBody; 63 | try { 64 | jsonBody = JSON.parse(body); 65 | } catch (ex) { 66 | ex.name = 'ParseDoubanAPIFailed'; 67 | ex.statusCode = status; 68 | ex.originHeaders = headers; 69 | ex.originBody = body; 70 | throw ex; 71 | } 72 | return jsonBody; 73 | }; 74 | 75 | Client.prototype.user = function (userId) { 76 | let url = this.getUrl(`/user/${userId}`); 77 | let body = this.request(url); 78 | return body; 79 | }; 80 | 81 | Client.prototype.groupTopic = function (groupName, page) { 82 | page = page || 1; 83 | let start = (page - 1) * 20; 84 | let url = this.getUrl(`/group/${groupName}/topics`, { 85 | start: start, 86 | }); 87 | let body = this.request(url); 88 | let topics = body.topics || []; 89 | return topics; 90 | }; 91 | -------------------------------------------------------------------------------- /model.js: -------------------------------------------------------------------------------- 1 | /**! 2 | * haixiu - model.js 3 | * 4 | */ 5 | 6 | 'use strict'; 7 | 8 | /** 9 | * Module dependencies. 10 | */ 11 | const mongoose = require('mongoose'); 12 | const Schema = mongoose.Schema; 13 | 14 | let PostSchema = new Schema({ 15 | id: String, 16 | url: String, 17 | title: String, 18 | imgs: [String], 19 | author_id: String, 20 | author_name: String, 21 | author_url: String, 22 | author_location: String, 23 | create_at: { type: Date, default: Date.now }, 24 | update_at: { type: Date, default: Date.now }, 25 | }); 26 | 27 | PostSchema.index({id: -1}, { unique: true }); 28 | PostSchema.index({create_at: -1}); 29 | 30 | let Post = mongoose.model('Post', PostSchema); 31 | 32 | exports.Post = Post; 33 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "haixiu", 3 | "version": "0.2.1", 4 | "description": "", 5 | "main": "app.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "MIT", 11 | "dependencies": { 12 | "co": "^4.6.0", 13 | "urllib-sync": "~1.1.2", 14 | "express": "^4.10.1", 15 | "express-handlebars": "^1.1.0", 16 | "lodash": "^4.17.19", 17 | "mongoose": "^4.2.4" 18 | }, 19 | "engines": { 20 | "node": "4.2.1" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /views/author.handlebars: -------------------------------------------------------------------------------- 1 |

{{authorName}}

2 | {{#each docs}} 3 | {{> _post this}} 4 | {{/each}} 5 | -------------------------------------------------------------------------------- /views/home.handlebars: -------------------------------------------------------------------------------- 1 |

豆瓣小组 - 请不要嘿咻

3 |

选择地域:

4 |

所有

5 | {{#each cities}} 6 |

{{name}}

7 | {{/each}} 8 | 9 | -------------------------------------------------------------------------------- /views/layouts/main.handlebars: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 害羞组爬虫 11 | 16 | 17 | 18 | 19 | {{{body}}} 20 | 21 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /views/partials/_post.handlebars: -------------------------------------------------------------------------------- 1 |

标题:《{{title}}》 {{url}}

2 |

创建时间:{{ create_at }}

3 |

作者:{{author_name}}

4 |

地址:{{author_location}}

5 | {{#each imgs}} 6 | 7 | {{/each}} 8 |
9 | -------------------------------------------------------------------------------- /views/posts.handlebars: -------------------------------------------------------------------------------- 1 | {{#each docs}} 2 | {{> _post this}} 3 | {{/each}} 4 | --------------------------------------------------------------------------------