├── README.md ├── config.js ├── import.js ├── last-repair-time.txt ├── models ├── index.js └── resource.js ├── package.json └── repair.js /README.md: -------------------------------------------------------------------------------- 1 | # mongodb to elasticsearch 2 | 3 | http://findit.keenwon.com 用到的一个小工具,用来将数据从mongodb导入到elasticsearch 4 | 5 | ### 使用方法 6 | 7 | * 安装依赖包`npm install` 8 | * 修改`config.js`中的Elasticsearch的ip和端口,`models/index.js`中mongodb的配置 9 | * 导入数据的话,执行`node import.js`,补充索引新数据,执行`node repair.js` 10 | 11 | ### 注意事项 12 | 13 | * 这是个**简单**的小工具,为了保证数据的完整性,要停止mongodb的写入操作 14 | * `repair.js` 是补充新数据的,上次导入的时间点记在`last-repair-time.txt`里面,执行`node repair.js`只会将该时间点之后的数据导入Elasticsearch,为了保证性能,建议在mongodb中,给`createTime`加索引 15 | 16 | 更多参看:[http://keenwon.com/1436.html](http://keenwon.com/1436.html "http://keenwon.com/1436.html") 17 | -------------------------------------------------------------------------------- /config.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | module.exports = { 4 | elasticsearchHost: '<你的Elasticsearch地址,例如127.0.0.1:9200>' 5 | }; 6 | -------------------------------------------------------------------------------- /import.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * 导入全部的数据 5 | */ 6 | 7 | var elasticsearch = require('elasticsearch'), 8 | index = require('./models/index'), 9 | Resource = index.Resource, 10 | config = require('./config'), 11 | client = new elasticsearch.Client({ 12 | host: config.elasticsearchHost, 13 | log: 'error' 14 | }); 15 | 16 | var _id = 0, 17 | count = 0; 18 | 19 | function run() { 20 | Resource.find({_id: {$gt: _id}}).select('_id n s t c').limit(10).exec(function (err, value) { 21 | if (err) { 22 | throw new Error(err); 23 | } 24 | 25 | // 输出信息 26 | console.log(_id + ' ' + count); 27 | 28 | if (value.length <= 0) { 29 | console.log('Done!'); 30 | return; 31 | } 32 | 33 | // 添加到elasticsearch 34 | client.bulk({ 35 | index: 'antcolony', 36 | type: 'resource', 37 | body: formatData(value) 38 | }, function (error, response) { 39 | if (error) { 40 | throw new Error(error); 41 | } 42 | 43 | // 继续下一组 44 | count += 10; 45 | _id = value[value.length - 1]._id; 46 | process.nextTick(run); 47 | }); 48 | }); 49 | } 50 | 51 | function formatData(data) { 52 | var result = []; 53 | for (var i = 0, j = data.length; i < j; i++) { 54 | var item = data[i].toJSON(); 55 | 56 | result.push({create: {_id: item._id}}); 57 | delete item._id; 58 | result.push(item); 59 | } 60 | return result; 61 | } 62 | 63 | console.log('Running......'); 64 | run(); -------------------------------------------------------------------------------- /last-repair-time.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /models/index.js: -------------------------------------------------------------------------------- 1 | var mongodbHost = '', 2 | mongodbPort = 27017, // mongodb端口号 3 | mongodbDatabase = '', 4 | mongodbUserName = '<用户名>', 5 | mongodbPassword = '<密码>'; 6 | 7 | var util = require('util'), 8 | mongoose = require('mongoose'), 9 | uri = util.format('mongodb://%s:%d/%s', mongodbHost, mongodbPort, mongodbDatabase); 10 | 11 | //mongoose.set('debug', true); 12 | 13 | mongoose.connect(uri, { 14 | user: mongodbUserName, 15 | pass: mongodbPassword 16 | }, function (err) { 17 | if (err) { 18 | console.error('connect to %s error: ', mongodbDatabase, err.message); 19 | process.exit(1); 20 | } 21 | }); 22 | 23 | mongoose.connection.on('error', function (err) { 24 | console.error('mongodb error: ' + err); 25 | }); 26 | 27 | // models 28 | require('./resource'); 29 | 30 | exports.Resource = mongoose.model('Resource'); -------------------------------------------------------------------------------- /models/resource.js: -------------------------------------------------------------------------------- 1 | var mongoose = require('mongoose'), 2 | Schema = mongoose.Schema; 3 | 4 | var Resource = new Schema({ 5 | 6 | // 设置_id为infohash 7 | _id: { type: String, required: true }, 8 | 9 | // name 资源名称 10 | n: { type: String, required: true }, 11 | 12 | // type 资源类型 13 | t: {type: String }, 14 | 15 | // size 资源总大小 16 | s: {type: Number }, 17 | 18 | // files 包含文件 19 | f: [ 20 | { 21 | _id: false, 22 | // name 文件名 23 | n: { type: String }, 24 | // size 文件大小 25 | s: { type: Number, default: 0 } 26 | } 27 | ], 28 | 29 | // hot 最新热度值 30 | h: { type: Number, default: 0 }, 31 | 32 | // hots 最近2周热度值, key:value 例如: 12-20:1000 33 | hs: [ 34 | { 35 | _id: false, 36 | // 时间 37 | t: { type: String }, 38 | // 热度值 39 | v: { type: Number, default: 0 } 40 | } 41 | ], 42 | 43 | // createDate 收录时间 44 | c: { type: Date, default: Date.now }, 45 | 46 | // updateDate 更新时间 47 | u: { type: Date, default: Date.now }, 48 | 49 | // disable 是否被禁用 50 | d: { type: Boolean } 51 | 52 | }); 53 | 54 | mongoose.model('Resource', Resource); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "mongodb-to-elasticsearch", 3 | "version": "1.0.0", 4 | "description": "a tool help you import data from mongodb to elasticsearch.", 5 | "author": { 6 | "name": "keenwon", 7 | "email": "semanwmj@yeah.net" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git://github.com/keenwon/mongodb-to-elasticsearch.git" 12 | }, 13 | "license": "ISC", 14 | "dependencies": { 15 | "mongoose": "^3.8.17", 16 | "elasticsearch": "^2.4.3", 17 | "moment": "^2.8.3" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /repair.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var fs = require('fs'), 4 | path = require('path'), 5 | filePath = path.join(__dirname, 'last-repair-time.txt'), 6 | elasticsearch = require('elasticsearch'), 7 | index = require('./models/index'), 8 | Resource = index.Resource, 9 | config = require('./config'), 10 | moment = require('moment'), 11 | client = new elasticsearch.Client({ 12 | host: config.elasticsearchHost, 13 | log: 'error' 14 | }); 15 | 16 | var createTime = fs.readFileSync(filePath).toString(), 17 | count = 0; 18 | 19 | if (!createTime) { 20 | throw new Error('createTime is required'); 21 | } 22 | 23 | createTime = moment.utc(createTime).toDate(); 24 | 25 | function run() { 26 | Resource.find({c: {$gt: createTime}}).select('_id n s t c').sort({'c': 1}).limit(10).exec(function (err, value) { 27 | if (err) { 28 | throw new Error(err); 29 | } 30 | 31 | // 输出信息 32 | console.log(createTime + ' ' + count); 33 | 34 | if (value.length <= 0) { 35 | console.log('Done!'); 36 | fs.writeFileSync(filePath,moment.utc(createTime).subtract(5, 'minute').format('YYYY-MM-DD HH:mm:ss')); 37 | return; 38 | } 39 | // 添加到elasticsearch 40 | client.bulk({ 41 | index: 'antcolony', 42 | type: 'resource', 43 | body: formatData(value) 44 | }, function (error, response) { 45 | if (error) { 46 | throw new Error(error); 47 | } 48 | 49 | // 继续下一组 50 | count += 10; 51 | createTime = moment.utc(value[value.length - 1].c).toDate(); 52 | process.nextTick(run); 53 | }); 54 | }); 55 | } 56 | 57 | function formatData(data) { 58 | var result = []; 59 | for (var i = 0, j = data.length; i < j; i++) { 60 | var item = data[i].toJSON(); 61 | 62 | result.push({create: {_id: item._id}}); 63 | delete item._id; 64 | result.push(item); 65 | } 66 | return result; 67 | } 68 | 69 | console.log('Running......'); 70 | run(); 71 | --------------------------------------------------------------------------------