├── .gitignore
├── LICENSE
├── README.md
├── crawl.js
├── crawler.js
├── export.csv
├── export.js
├── package.json
├── rethinkdb.js
└── trrnts.js

/.gitignore:
--------------------------------------------------------------------------------
node_modules
public
.tmp
.sass-cache
.idea
client/bower_components
dist
/server/config/local.env.js

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Max Mathys

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# dht-crawler

Minimal BitTorrent crawler and scheduler with a RethinkDB backend to **collect, analyse and store** peers.

This crawler searches for peers of one or more torrents you define. Its core, forked from [Trrnts](https://github.com/Trrnts/Trrnts), discovers peers via the [DHT protocol](http://www.bittorrent.org/beps/bep_0005.html) and sends roughly 4000 UDP packets per second. Afterwards, the crawler looks up each peer's origin in a *GeoIP database* and stores the result.

# The stack
Node, trrnts (crawler core), RethinkDB

# The crawler
## The DHT protocol implementation
The BitTorrent DHT protocol is based on UDP, a connection-less, low-level transport protocol. The team behind Trrnts wrote a pretty straightforward implementation of the protocol, although more abstract approaches to DHT crawling exist.
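
Concretely, each crawl boils down to sending `get_peers` queries to DHT nodes. The sketch below is illustrative rather than part of the crawler; it assumes the same `bencode` package the crawler already depends on, and the node ID is a placeholder:

```js
// Minimal sketch of a KRPC get_peers query as defined in BEP 5.
var bencode = require('bencode');

var message = bencode.encode({
  t: new Buffer([0x00, 0x01]),  // 2-byte transaction ID, echoed in the reply
  y: 'q',                       // message type: query
  q: 'get_peers',               // query name
  a: {
    // 20-byte sender node ID (placeholder value)
    id: new Buffer('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa', 'hex'),
    // info hash of the torrent being crawled
    info_hash: new Buffer('8CA378DBC8F62E04DF4A4A0114B66018666C17CD', 'hex')
  }
});
// `message` is now a Buffer that can be sent as a single UDP datagram;
// the reply carries either peers ("values") or closer nodes ("nodes").
```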

## Sample benchmark
This is a benchmark of a popular torrent as listed on [kickass.to](http://kickass.to):
![Sample benchmark](http://i.imgur.com/YkgClkU.png)

--------------------------------------------------------------------------------
/crawl.js:
--------------------------------------------------------------------------------
var crawler = require('./crawler');
var db = require('./rethinkdb');
var parseMagnetURI = require('magnet-uri');
var _ = require('lodash');

var opt = require('node-getopt').create([
  ['a', 'add=ARG',      'add magnet URI to crawler'],
  ['l', 'list',         'list the magnets which we want to parse'],
  ['r', 'remove=ARG',   'remove the magnet with its infoHash'],
  ['t', 'time=ARG',     'time for each crawl in s'],
  ['p', 'parallel=ARG', 'parallel crawls to perform'],
  ['i', 'interval=ARG', 'interval between the crawls in s'],
  ['o', 'once',         'once: don\'t repeat crawling, just crawl once'],
  ['b', 'benchmark',    'benchmark: print found peers and nodes every 10s'],
  ['e', 'export=ARG',   'export all data in the database to CSV with path ARG'],
  ['h', 'help',         'display this help']
])                // create Getopt instance
  .bindHelp()     // bind option 'help' to default action
  .parseSystem(); // parse command line

db.setup();

if (opt.options.add) {
  var parsedMagnetURI = {};
  try {
    parsedMagnetURI = parseMagnetURI(opt.options.add);
  } catch (e) { }
  // Empty parsed object -> invalid magnet link!
  if (_.isEmpty(parsedMagnetURI)) {
    console.log('Invalid Magnet URI');
    process.exit(1);
  }

  if (!parsedMagnetURI.name) parsedMagnetURI.name = "torrent";
  db.insertMagnet(parsedMagnetURI.name, parsedMagnetURI.infoHash, true, function (err, success) {
    if (err) {
      console.log("Failed to insert Magnet");
      process.exit(1);
    } else {
      console.log("Added " + parsedMagnetURI.name + " " + parsedMagnetURI.infoHash);
      process.exit(0);
    }
  });
} else if (opt.options.remove) {
  var magnet = opt.options.remove.toLowerCase();
  db.delete(magnet, function (err, res) {
    if (err) throw err;
    console.log("Removed: " + res.deleted);
    process.exit(0);
  });
} else if (opt.options.list) {
  db.getMagnets(function (err, res) {
    if (err) throw err;
    var c = 0;
    _.forEach(res, function (obj) {
      console.log((++c) + " Name: " + obj.name + "\n  Info hash: " + obj.infoHash);
    });
    process.exit(0);
  });
} else if (opt.options.export) {
  db.export(opt.options.export, function (err, ret) {
    if (err) throw err;
    else console.log("Successfully exported csv to " + opt.options.export);
    process.exit(0);
  });
} else {
  if (!opt.options.time)
    opt.options.time = 20;
  if (!opt.options.parallel)
    opt.options.parallel = 4;
  if (!opt.options.once)
    opt.options.once = false;
  if (!opt.options.benchmark)
    opt.options.benchmark = false;

  db.getMagnets(function (err, res) {
    var magnets = res.length;
    var minInterval = Math.ceil(res.length / opt.options.parallel) * opt.options.time;
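    // Worked example of the line above: with 10 magnets, parallel = 4 and
    // time = 20 s, the magnets are crawled in ceil(10/4) = 3 batches of up
    // to 4 concurrent crawls, each batch taking 20 s, so one full pass needs
    // at least 3 * 20 = 60 s. A user-supplied --interval shorter than this
    // minimum cannot be honoured. (The figures are illustrative.)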
| "geoip-lite": "^1.1.5", 16 | "assert": "^1.3.0", 17 | "util": "^0.10.3", 18 | "rethinkdb": "^1.15.0-0", 19 | "debug": "^2.1.1", 20 | "node-getopt": "^0.2.3", 21 | "fast-csv": "^0.5.4" 22 | }, 23 | "license": "MIT" 24 | } 25 | -------------------------------------------------------------------------------- /rethinkdb.js: -------------------------------------------------------------------------------- 1 | // A fork of the [node.js chat app](https://github.com/eiriksm/chat-test-2k) 2 | // by [@orkj](https://twitter.com/orkj) using socket.io, rethinkdb, passport and bcrypt on an express app. 3 | // 4 | // See the [GitHub README](https://github.com/rethinkdb/rethinkdb-example-nodejs-chat/blob/master/README.md) 5 | // for details of the complete stack, installation, and running the app. 6 | 7 | var r = require('rethinkdb') 8 | , util = require('util') 9 | , assert = require('assert') 10 | , logdebug = require('debug')('rdb:debug') 11 | , logerror = require('debug')('rdb:error'); 12 | 13 | 14 | // #### Connection details 15 | 16 | // RethinkDB database settings. Defaults can be overridden using environment variables. 17 | var dbConfig = { 18 | host: process.env.RDB_HOST || 'localhost', 19 | port: parseInt(process.env.RDB_PORT) || 28015, 20 | db : process.env.RDB_DB || 'dht_crawler', 21 | tables: { 22 | 'crawls': 'id', 23 | 'crawl_queue': 'id', 24 | 'magnets': 'id' 25 | } 26 | }; 27 | 28 | module.exports.setup = function() { 29 | r.connect({host: dbConfig.host, port: dbConfig.port }, function (err, connection) { 30 | assert.ok(err === null, err); 31 | r.dbCreate(dbConfig.db).run(connection, function(err, result) { 32 | if(err) { 33 | logdebug("[DEBUG] RethinkDB database '%s' already exists (%s:%s)\n%s", dbConfig.db, err.name, err.msg, err.message); 34 | } 35 | else { 36 | logdebug("[INFO ] RethinkDB database '%s' created", dbConfig.db); 37 | } 38 | 39 | for(var tbl in dbConfig.tables) { 40 | (function (tableName) { 41 | r.db(dbConfig.db).tableCreate(tableName, {primaryKey: dbConfig.tables[tbl]}).run(connection, function(err, result) { 42 | if(err) { 43 | logdebug("[DEBUG] RethinkDB table '%s' already exists (%s:%s)\n%s", tableName, err.name, err.msg, err.message); 44 | } 45 | else { 46 | logdebug("[INFO ] RethinkDB table '%s' created", tableName); 47 | } 48 | }); 49 | })(tbl); 50 | } 51 | }); 52 | }); 53 | }; 54 | 55 | 56 | /** 57 | * Get all magnets in the table 58 | * 59 | * @param {Function} callback 60 | * callback invoked after collecting all the results 61 | * 62 | * @returns {Array} 63 | */ 64 | module.exports.getMagnets = function (callback) { 65 | onConnect(function (err, connection) { 66 | r.db(dbConfig['db']) 67 | .table('magnets') 68 | .run(connection, function(err, cursor){ 69 | if(err) throw err; 70 | cursor.toArray(function(err, result) { 71 | if(err) throw err; 72 | callback(err, result); 73 | }) 74 | }); 75 | }); 76 | }; 77 | 78 | /** 79 | * Insert a new magnet to crawl. 80 | * 81 | * RethinkDB will use the primary key index to fetch the result. 82 | * 83 | * @param {String} name 84 | * The description of the torrent. optional 85 | * 86 | * @param {String} infoHash 87 | * The infoHash of the torrent. 88 | * 89 | * @param {Boolean} shouldCrawl 90 | * Whether the torrent should be crawled. 

module.exports.setup = function() {
  r.connect({host: dbConfig.host, port: dbConfig.port}, function (err, connection) {
    assert.ok(err === null, err);
    r.dbCreate(dbConfig.db).run(connection, function(err, result) {
      if (err) {
        logdebug("[DEBUG] RethinkDB database '%s' already exists (%s:%s)\n%s", dbConfig.db, err.name, err.msg, err.message);
      }
      else {
        logdebug("[INFO ] RethinkDB database '%s' created", dbConfig.db);
      }

      for (var tbl in dbConfig.tables) {
        (function (tableName) {
          r.db(dbConfig.db).tableCreate(tableName, {primaryKey: dbConfig.tables[tableName]}).run(connection, function(err, result) {
            if (err) {
              logdebug("[DEBUG] RethinkDB table '%s' already exists (%s:%s)\n%s", tableName, err.name, err.msg, err.message);
            }
            else {
              logdebug("[INFO ] RethinkDB table '%s' created", tableName);
            }
          });
        })(tbl);
      }
    });
  });
};


/**
 * Get all magnets in the table.
 *
 * @param {Function} callback
 *   invoked with (err, results) after collecting all the results
 */
module.exports.getMagnets = function (callback) {
  onConnect(function (err, connection) {
    r.db(dbConfig['db'])
      .table('magnets')
      .run(connection, function(err, cursor) {
        if (err) throw err;
        cursor.toArray(function(err, result) {
          if (err) throw err;
          callback(err, result);
        });
      });
  });
};

/**
 * Insert a new magnet to crawl.
 *
 * @param {String} name
 *   The description of the torrent (optional).
 *
 * @param {String} infoHash
 *   The infoHash of the torrent.
 *
 * @param {Boolean} shouldCrawl
 *   Whether the torrent should be crawled.
 *
 * @param {Function} callback
 *   invoked with (err, result) once the insert has completed
 */
module.exports.insertMagnet = function (name, infoHash, shouldCrawl, callback) {
  onConnect(function (err, connection) {
    r.db(dbConfig['db'])
      .table('magnets')
      .insert({
        name: name,
        infoHash: infoHash,
        shouldCrawl: shouldCrawl
      })
      .run(connection, function(err, result) {
        if (err) throw err;
        callback(null, []);
      });
  });
};

/**
 * Insert a new crawl result.
 *
 * @param {String} infoHash
 *   The infoHash of the torrent.
 *
 * @param {Number} time
 *   The time in ms when the crawl was finished.
 *
 * @param {Array} peers
 *   The peers, including IP address and geolocation: ip, country, region, city, ll.
 *
 * @param {Function} callback
 *   invoked with (err, result) once the insert has completed
 */
module.exports.insertCrawl = function (infoHash, time, peers, callback) {
  onConnect(function (err, connection) {
    r.db(dbConfig['db']).table('crawls').insert([
      {
        time: time,
        infoHash: infoHash,
        peers: peers
      }
    ]).run(connection, function(err, result) {
      if (err) throw err;
      callback(null, []);
    });
  });
};
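
// Shape of a document in the `crawls` table, inferred from the parameters
// above (the values are made up for illustration):
//
//   {
//     infoHash: '8ca378dbc8f62e04df4a4a0114b66018666c17cd',
//     time: 1420000000000,
//     peers: [
//       { ip: '203.0.113.7', country: 'CH', region: 'ZH',
//         city: 'Zurich', ll: [47.37, 8.55] }
//     ]
//   }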

module.exports.newQueue = function (callback) {
  onConnect(function (err, connection) {
    r.db(dbConfig['db'])
      .table('crawl_queue')
      .delete()
      .run(connection, function(err, res) {
        if (err) throw err;
        r.db(dbConfig['db'])
          .table('crawl_queue')
          .insert(
            r.db(dbConfig['db'])
              .table('magnets')
              .filter(r.row('shouldCrawl').eq(true))
          )
          .run(connection, function(err, res) {
            // inserted
            if (err) throw err;
            callback(err, res);
            r.db(dbConfig['db']).table("crawl_queue")
              .filter(r.row('shouldCrawl').eq(true))
              .count()
              .run(connection, function(err, res) {
                if (err) throw err;
                // console.log("Beginning crawl queue with " + res + " magnets.");
              });
          });
      });
  });
};

module.exports.nextCrawl = function(magnetCount, callback) {
  onConnect(function (err, connection) {
    r.db(dbConfig['db']).table("crawl_queue")
      .filter({'shouldCrawl': true})
      .limit(magnetCount)
      .run(connection, function(err, res) {
        if (err) throw err;
        res.toArray(function(err2, res2) {
          var ret = res2;
          // Mark only the fetched batch as taken, so the rest of the queue
          // remains available for the next call.
          var ids = res2.map(function(doc) { return doc.id; });
          r.db(dbConfig['db']).table("crawl_queue")
            .getAll(r.args(ids))
            .update({'shouldCrawl': false})
            .run(connection, function(err3, res3) {
              callback(err, ret);
            });
        });
      });
  });
};

module.exports.delete = function(infoHash, callback) {
  onConnect(function (err, connection) {
    r.db(dbConfig['db']).table("magnets")
      .filter(r.row('infoHash').eq(infoHash))
      .delete()
      .run(connection, function(err, res) {
        if (err) throw err;
        callback(err, res);
      });
  });
};

module.exports.export = function(callback) {
  onConnect(function (err, connection) {
    callback(err, connection, r.db(dbConfig['db']));
  });
};

// #### Helper functions

/**
 * A wrapper function for the RethinkDB API `r.connect`
 * to keep the configuration details in a single function
 * and fail fast in case of a connection error.
 */
function onConnect(callback) {
  r.connect({host: dbConfig.host, port: dbConfig.port}, function(err, connection) {
    assert.ok(err === null, err);
    connection['_id'] = Math.floor(Math.random() * 10001);
    callback(err, connection);
  });
}

// #### Connection management
//
// This application uses a new connection for each query needed to serve
// a user request. In case generating the response would require multiple
// queries, the same connection should be used for all queries.
//
// Example:
//
//     onConnect(function (err, connection) {
//       if (err) { return callback(err); }
//
//       query1.run(connection, callback);
//       query2.run(connection, callback);
//     });

--------------------------------------------------------------------------------
/trrnts.js:
--------------------------------------------------------------------------------
var bencode = require('bencode'),
    dgram = require('dgram'),
    hat = require('hat'),
    _ = require('lodash');

// Wrap a function so that the returned function won't ever throw an error.
// This is quite useful for malformed messages.
var makeSafe = function (fn, onFuckedUp) {
  return function () {
    try {
      return fn.apply(null, arguments);
    } catch (e) {
      // console.log(e);
      return onFuckedUp;
    }
  };
};

// See https://github.com/bencevans/node-compact2string.
var compact2string = makeSafe(require('compact2string'));

// Necessary formatting for the protocols we are using.
var transactionIdToBuffer = makeSafe(function (transactionId) {
  var buf = new Buffer(2);
  buf.writeUInt16BE(transactionId, 0);
  return buf;
});

// Necessary formatting for the protocols we are using.
var idToBuffer = makeSafe(function (id) {
  return new Buffer(id, 'hex');
});

var decode = makeSafe(bencode.decode, {}),
    encode = makeSafe(bencode.encode, {});

var ROUTERS = [
      'router.bittorrent.com:6881',
      'router.utorrent.com:6881',
      'dht.transmissionbt.com:6881'
    ],
    BOOTSTRAP_NODES = ROUTERS.slice();

var nodeID = hat(160),
    port = process.env.UDP_PORT || 6881,
    socket = dgram.createSocket('udp4');

// Update our ID once in a while, since we are essentially spamming the DHT
// network and this might prevent other nodes from blocking us.
setInterval(function () {
  nodeID = hat(160);
}, 10000);

// Key: infoHash; Value: Object representing the current results of this crawl
// job (peers and nodes set using object).
var jobs = {};

// Key: transactionId; Value: infoHash
var transactions = {};

// This function will be invoked as soon as a node/peer sends a message. It does
// a lot of formatting for the protocols.
socket.on('message', function (msg, rinfo) {
  // Add to our bootstrap nodes. This means we'll be able to query the DHT
  // network in a more direct way in the future.
  BOOTSTRAP_NODES.push(rinfo.address + ':' + rinfo.port);

  if (BOOTSTRAP_NODES.length > 100) {
    BOOTSTRAP_NODES.shift();
  }

  // console.log('Received message from ' + rinfo.address);
  msg = decode(msg);
  var transactionId = Buffer.isBuffer(msg.t) && msg.t.length === 2 && msg.t.readUInt16BE(0);
  var infoHash = transactions[transactionId];
  if (transactionId === false || infoHash === undefined || jobs[infoHash] === undefined) {
    return;
  }
  delete transactions[transactionId];
  if (msg.r && msg.r.values) {
    _.each(msg.r.values, function (peer) {
      peer = compact2string(peer);
      if (peer && !jobs[infoHash].peers[peer]) {
        // console.log('Found new peer ' + peer + ' for ' + infoHash);
        jobs[infoHash].peers[peer] = true;
        jobs[infoHash].queue.push(peer);
      }
    });
  }
  if (msg.r && msg.r.nodes && Buffer.isBuffer(msg.r.nodes)) {
    for (var i = 0; i < msg.r.nodes.length; i += 26) {
      var node = compact2string(msg.r.nodes.slice(i + 20, i + 26));
      if (node && !jobs[infoHash].nodes[node]) {
        // console.log('Found new node ' + node + ' for ' + infoHash);
        jobs[infoHash].nodes[node] = true;
        jobs[infoHash].queue.push(node);
      }
    }
  }
});
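
// Each entry in `msg.r.nodes` is 26 bytes: a 20-byte node ID followed by
// 6 bytes of "compact IP-address/port info" (4 bytes IPv4 + 2 bytes
// big-endian port), which is why the loop above slices bytes 20..26 of
// every entry. For example, the six bytes 01 02 03 04 1a e1 decode to
// "1.2.3.4:6881" (0x1ae1 = 6881).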

// Sends the get_peers request to a node.
var getPeers = function (infoHash, addr) {
  // console.log('Sending get_peers to ' + addr + ' for ' + infoHash);
  addr = addr.split(':');
  var ip = addr[0],
      port = parseInt(addr[1]);
  if (port <= 0 || port >= 65536) {
    return;
  }
  // var transactionId = _.random(Math.pow(2, 16));
  var transactionId = _.random(Math.pow(2, 12));
  transactions[transactionId] = infoHash;
  setTimeout(function () {
    // Delete the transaction after five seconds if we didn't get a response.
    // This is extremely important. Otherwise we might get a memory leak.
    delete transactions[transactionId];
  }, 5000);
  var message = encode({
    t: transactionIdToBuffer(transactionId),
    y: 'q',
    q: 'get_peers',
    a: {
      id: idToBuffer(nodeID),
      info_hash: idToBuffer(infoHash)
    }
  });
  socket.send(message, 0, message.length, port, ip);
};

var crawl = function (infoHash, ttl, callback, benchmark) {
  console.log('[START] Crawling ' + infoHash);

  if (jobs[infoHash]) {
    return callback(new Error('Crawl job already in progress'));
  }

  var queue = [];

  // Packets might get lost. This sends each get_peers request multiple times.
  // Routers provided by BitTorrent, Inc. are sometimes down. This way we
  // ensure that we correctly enter the DHT network. Otherwise, we might not
  // get a single peer/node.
  _.times(5, function () {
    queue = queue.concat(BOOTSTRAP_NODES);
  });

  jobs[infoHash] = {
    peers: {},
    nodes: {},
    queue: queue
  };

  var bench = null;
  if (benchmark) {
    bench = setInterval(function () {
      console.log(_.keys(jobs[infoHash].peers).length + " " + _.keys(jobs[infoHash].nodes).length);
    }, 10 * 1000);
  }

  setTimeout(function () {

    // Clear the interval. Don't mess up the event loop!
    clearInterval(crawling);

    if (benchmark) {
      clearInterval(bench);
    }

    var peers = _.keys(jobs[infoHash].peers);
    var nodes = _.keys(jobs[infoHash].nodes);

    console.log('[DONE] Done crawling ' + infoHash + '.\n  Found ' + peers.length + ' peers and ' + nodes.length + ' nodes.');

    // Delete the job! This is very important for future crawls and for
    // preventing memory leaks!
    delete jobs[infoHash];

    callback(null, {
      peers: peers,
      nodes: nodes
    });

    // ttl: time in ms for a job to live.
  }, ttl);

  // We limit the number of outgoing UDP requests to 1000 packets per second.
  // We clear this interval in the setTimeout function above.
  var crawling = setInterval(function () {
    if (jobs[infoHash].queue.length > 0) {
      getPeers(infoHash, jobs[infoHash].queue.shift());
    }
  }, 1);
};
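
// Throughput arithmetic: the 1 ms pump interval above caps each job at
// roughly 1000 get_peers packets per second; with the default of 4 parallel
// crawls in crawl.js, that adds up to the ~4000 UDP packets per second
// mentioned in the README.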

module.exports = exports = crawl;
module.exports.init = function (callback) {
  socket.bind(port, callback);
};


// Example usage (the second argument is the crawl's ttl in ms):
// var crawl = require('./trrnts');
// crawl.init(function () {
//   crawl('8CA378DBC8F62E04DF4A4A0114B66018666C17CD', 20000, function (err, result) {
//     console.log(result);
//     process.exit(0);
//   });
// });
--------------------------------------------------------------------------------