├── .gitignore ├── defaults.json ├── .npmignore ├── demo.gif ├── index.js ├── test ├── basic.html ├── demo.js └── test.js ├── README.md ├── package.json └── src ├── ui.js └── listeners.js /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | -------------------------------------------------------------------------------- /defaults.json: -------------------------------------------------------------------------------- 1 | { 2 | "logger": {} 3 | } 4 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | .gitignore 4 | .npmignore 5 | test/ 6 | -------------------------------------------------------------------------------- /demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/medialab/sandcrawler-dashboard/HEAD/demo.gif -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Sandcrawler Dashboard Public Interface 3 | * ======================================= 4 | * 5 | * Just a matter of exporting the plugin function. Not something very fancy 6 | * as you might notice. 
7 | */ 8 | var UI = require('./src/ui.js'), 9 | listeners = require('./src/listeners.js'), 10 | defaults = require('./defaults.json'), 11 | _ = require('lodash'); 12 | 13 | module.exports = function(opts) { 14 | 15 | return function(spider) { 16 | var ui = new UI(); 17 | 18 | return listeners(spider, ui, _.extend({}, defaults, opts)); 19 | }; 20 | }; 21 | -------------------------------------------------------------------------------- /test/basic.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Basic 5 | 6 | 7 | 21 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sandcrawler-dashboard 2 | 3 | A handy terminal dashboard displaying advanced information about one of your [sandcrawler](http://medialab.github.io/sandcrawler/) spiders. 4 | 5 | ![demo](./demo.gif) 6 | 7 | ## Installation 8 | 9 | You can install **sandcrawler-dashboard** through npm: 10 | 11 | ```bash 12 | npm install sandcrawler-dashboard 13 | ``` 14 | 15 | ## Usage 16 | 17 | ```js 18 | var sandcrawler = require('sandcrawler'), 19 | dashboard = require('sandcrawler-dashboard'); 20 | 21 | var spider = sandcrawler.spider('MyFancySpider') 22 | .use(dashboard()) 23 | .url('http://nicesite.org') 24 | .scraper(function($, done) { 25 | done(null, $('title').text()); 26 | }) 27 | .run(); 28 | ``` 29 | 30 | ## Options 31 | 32 | * **logger** *?object*: Any options to pass to the `sandcrawler-logger` used by the dashboard internally. Possible options can be found [here](https://github.com/Yomguithereal/sandcrawler-logger#options). 
33 | 34 | *Example* 35 | 36 | ```js 37 | var sandcrawler = require('sandcrawler'), 38 | dashboard = require('sandcrawler-dashboard'); 39 | 40 | var spider = sandcrawler.spider('MyFancySpider') 41 | .use(dashboard({logger: {color: 'red'}})); 42 | ``` 43 | 44 | ## License 45 | 46 | MIT 47 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "sandcrawler-dashboard", 3 | "version": "0.1.1", 4 | "description": "A handy terminal dashboard plugin for sandcrawler.", 5 | "main": "index.js", 6 | "scripts": { 7 | "demo": "node ./test/demo.js", 8 | "test": "node ./test/test.js" 9 | }, 10 | "repository": { 11 | "type": "git", 12 | "url": "https://github.com/medialab/sandcrawler-dashboard" 13 | }, 14 | "keywords": [ 15 | "sandcrawler-plugin", 16 | "dashboard" 17 | ], 18 | "contributors": [ 19 | { 20 | "name": "Daniele Guido", 21 | "url": "https://github.com/danieleguido" 22 | }, 23 | { 24 | "name": "Guillaume Plique", 25 | "url": "https://github.com/Yomguithereal" 26 | } 27 | ], 28 | "author": "yomguithereal ", 29 | "license": "MIT", 30 | "bugs": { 31 | "url": "https://github.com/medialab/sandcrawler-dashboard/issues" 32 | }, 33 | "homepage": "https://github.com/medialab/sandcrawler-dashboard", 34 | "dependencies": { 35 | "blessed": "0.0.51", 36 | "blessed-contrib": "^1.0.11", 37 | "chalk": "^1.0.0", 38 | "lodash": "^3.1.0", 39 | "sandcrawler-logger": "0.1.1" 40 | }, 41 | "devDependencies": { 42 | "express": "^4.11.2", 43 | "sandcrawler": "git+https://github.com/medialab/sandcrawler.git" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /test/demo.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Sandcrawler Dashboard Test 3 | * =========================== 4 | * 5 | * Actual dashboard test. 
// Helpers

// Random integer between `min` and `max`, both bounds included
function randInt(min, max) {
  var span = max - min + 1;

  return Math.floor(Math.random() * span) + min;
}

// Waits between 1s and 2.5s, then discards about one job out of ten
function delayedGate(req, next) {
  var delay = randInt(2, 5) * 500;

  setTimeout(function() {
    if (randInt(1, 10) > 9)
      return next(new Error('discard'));

    return next();
  }, delay);
}

// About one url out of ten targets a route the static server does not serve
function targetUrl(i) {
  if (randInt(1, 10) > 9)
    return 'http://localhost:3002/not-found';

  return 'http://localhost:3002/basic.html?' + (i + 1);
}

// Rejects about one scraped result out of eight
function flakyValidation(req, res, next) {
  if (randInt(1, 8) > 7)
    return next(new Error('invalid-data'));

  return next();
}

// Server
var app = express();
app.use('/', express.static(__dirname));

// Spider
var spider = sandcrawler.phantomJawa('MyJawa')
  .use(logger())
  .config({concurrency: 4, autoRetry: 'later', maxRetries: 3})
  .beforeScraping(delayedGate)
  .urls(_.range(50).map(targetUrl))
  .scraper(function($, done) {
    var payload = Math.random() > 0.5 ?
      {hello: 'world'} :
      $('.url-list a').scrape('href');

    return done(null, payload);
  })
  .afterScraping(flakyValidation);

// Listening
var server = app.listen(3002);

// The static server is only needed for as long as the spider runs
sandcrawler.run(spider, function(err) {
  server.close();
});
// Helpers

// Random integer between `min` and `max`, both bounds included
function randInt(min, max) {
  var span = max - min + 1;

  return Math.floor(Math.random() * span) + min;
}

// Waits between 1s and 5s, then discards about one job out of ten
function delayedGate(req, next) {
  var delay = randInt(2, 10) * 500;

  setTimeout(function() {
    if (randInt(1, 10) > 9)
      return next(new Error('discard'));

    return next();
  }, delay);
}

// About one url out of ten is a very long one, the others hit basic.html
function targetUrl(i) {
  if (randInt(1, 10) > 9)
    return 'http://localhost:3002/basic/this/is/an-insupportably-long-and-inexistant/url/just-for-thesakeofitandbecauseI/can.tm';

  return 'http://localhost:3002/basic.html?' + (i + 1);
}

// Rejects about one scraped result out of eight
function flakyValidation(req, res, next) {
  if (randInt(1, 8) > 7)
    return next(new Error('invalid-data'));

  return next();
}

// Server
var app = express();
app.use('/', express.static(__dirname));

// Spider
var spider = sandcrawler.phantomJawa('MyJawa')
  .use(logger())
  .config({concurrency: 4, autoRetry: true, maxRetries: 3})
  .beforeScraping(delayedGate)
  .urls(_.range(50).map(targetUrl))
  .scraper(function($, done) {
    var payload = Math.random() > 0.5 ?
      {hello: 'world'} :
      $('.url-list a').scrape('href');

    return done(null, payload);
  })
  .afterScraping(flakyValidation);

// Listening
var server = app.listen(3002);

// The static server is only needed for as long as the spider runs
sandcrawler.run(spider, function(err) {
  server.close();
});
/**
 * Dashboard UI constructor.
 *
 * Creates the blessed screen, lays every widget of the dashboard out on it
 * and renders the whole once. The widgets are exposed as the `log`,
 * `request`, `response`, `jobTable`, `progressBar`, `stats` and `info`
 * properties, the screen itself as `screen`.
 */
function UI() {
  var screen = blessed.screen();

  // Builds widget options, every widget sharing the same blue line border
  function framed(label, top, left, width, height, extra) {
    return _.extend({
      label: label,
      top: top,
      left: left,
      border: {
        type: 'line',
        fg: 'blue'
      },
      width: width,
      height: height
    }, extra);
  }

  // Log component (keeps its own buffer of displayed lines)
  this.log = blessed.box(framed('Log', '0', '0', '60%', '70%', {wrap: true}));
  this.log.lines = [];

  // Request component
  this.request = blessed.box(framed('Request', '70%', '0', '30%', '30%'));

  // Response component
  this.response = blessed.box(framed('Response', '70%', '30%', '30%', '30%'));

  // Job table component
  var jobTable = contrib.table(
    framed('Jobs', '0', '60%', '40%', '60%', {columnSpacing: [6, 70, 20]})
  );

  // Rows are indexed by job id
  jobTable.jobs = {};

  jobTable.add = function(id, rows) {
    this.jobs[id] = rows;

    return this;
  };

  jobTable.remove = function(id) {
    delete this.jobs[id];

    return this;
  };

  jobTable.find = function(id) {
    return this.jobs[id];
  };

  // Pushes the indexed rows to the widget and selects the last one
  jobTable.update = function() {
    var rows = _.values(this.jobs);

    this.setData({
      headers: ['', 'Url', 'Error'],
      data: rows
    });

    this.rows.select(rows.length - 1);
  };

  // Getting my style
  var selected = jobTable.rows.style.selected;
  selected.bg = undefined;
  selected.fg = 'white';

  jobTable.focus();
  this.jobTable = jobTable;

  // Gauge component
  this.progressBar = blessed.ProgressBar(
    framed('Progress - 0%', '60%', '60%', '40%', '10%', {barBg: 'blue'})
  );

  // Stats component
  this.stats = blessed.box(framed('Stats', '70%', '60%', '20%', '30%'));

  // Information component
  this.info = blessed.box(framed('Information', '70%', '80%', '20%', '30%'));

  // Rendering the UI
  [
    this.log,
    this.request,
    this.response,
    this.jobTable,
    this.progressBar,
    this.stats,
    this.info
  ].forEach(function(widget) {
    screen.append(widget);
  });

  screen.render();

  // Getting out of the dashboard (might get useful...)
  screen.key(['C-c'], function(ch, key) {
    return process.exit(0);
  });

  this.screen = screen;
}
// Helpers

/**
 * Left-pads a value with a single zero so that it spans at least two
 * characters.
 *
 * @param  {number|string} nb - Value to pad.
 * @return {string}
 */
function pad(nb) {
  var nbstr = '' + nb;

  return nbstr.length < 2 ? '0' + nbstr : nbstr;
}

/**
 * Rounds a duration to a whole number of seconds, falling back to 0 for
 * negative, NaN or infinite input.
 *
 * Rounding once, before splitting into fields, keeps the carry: 59.6 becomes
 * one full minute instead of "0 minutes, 0 seconds".
 *
 * @param  {number} seconds
 * @return {number}
 */
function wholeSeconds(seconds) {
  var total = Math.round(seconds);

  return isFinite(total) && total > 0 ? total : 0;
}

/**
 * Formats a duration as `HH:MM:SS`.
 *
 * @param  {number} seconds - Duration to format.
 * @return {string}
 */
function formatHMS(seconds) {
  var total = wholeSeconds(seconds),
      hours = Math.floor(total / 3600),
      minutes = Math.floor((total % 3600) / 60);

  return pad(hours) + ':' + pad(minutes) + ':' + pad(total % 60);
}

/**
 * Formats a duration as `MM:SS`. Minutes are not wrapped at 60.
 *
 * @param  {number} seconds - Duration to format.
 * @return {string}
 */
function formatMS(seconds) {
  var total = wholeSeconds(seconds);

  return pad(Math.floor(total / 60)) + ':' + pad(total % 60);
}

/**
 * Shortens a url so that it fits into `pad` characters.
 *
 * A url that already fits is returned untouched. Otherwise, by order of
 * preference: `root/../<last path segment>`, then `root/../..`, then the
 * beginning of the root followed by `...`, where root is `protocol//host`.
 * The result is never longer than `pad`.
 *
 * @param  {string} url   - The url to shorten.
 * @param  {number} [pad] - Maximum length; anything that is not a positive
 *                          number falls back to 45.
 * @return {string}
 */
function formatUrl(url, pad) {
  url = url == null ? '' : '' + url;

  // Callers derive `pad` from a widget width, which may drop to zero or below
  if (!(pad > 0))
    pad = 45;

  if (url.length <= pad)
    return url;

  var parsed = nodeUrl.parse(url),
      root = parsed.host ? (parsed.protocol || '') + '//' + parsed.host : '';

  // No room even for the shortest collapsed form (`root/../..`, 6 extra
  // characters): keep the beginning of the root only
  if (root.length + 6 > pad)
    return (root.slice(0, Math.max(pad - 3, 0)) + '...').slice(0, pad);

  // `parsed.path` is null when the url carries no path at all
  var lastSegment = (parsed.path || '').split('/').pop(),
      collapsed = root + '/../' + lastSegment;

  return collapsed.length > pad ? root + '/../..' : collapsed;
}
69 | var renderInterval = setInterval(render, 30); 70 | 71 | function updateInformation() { 72 | var errors = _(spider.stats.errorIndex) 73 | .pairs() 74 | .sortBy(function(p) { 75 | return -p[1]; 76 | }) 77 | .map(function(p) { 78 | return ' ' + chalk.red(p[0]) + ' ' + p[1]; 79 | }) 80 | .value(); 81 | 82 | ui.stats.setContent([ 83 | chalk.grey.bold('Queued jobs ') + spider.stats.queued, 84 | chalk.grey.bold('In-progress jobs ') + spider.stats.doing, 85 | chalk.grey.bold('Done jobs ') + spider.stats.done, 86 | '', 87 | chalk.grey.bold('Successes ') + chalk.green(spider.stats.successes), 88 | chalk.grey.bold('Failures ') + chalk.red(spider.stats.failures), 89 | chalk.grey.bold('Success Rate ') + spider.stats.successRate + '%', 90 | '', 91 | chalk.grey.bold('Engine type ') + spider.type, 92 | chalk.grey.bold('Concurrency ') + spider.options.concurrency 93 | ].join('\n')); 94 | 95 | var elapsed = spider.stats.getElapsedTime(), 96 | estimate = spider.stats.getRemainingTimeEstimation(), 97 | average = spider.stats.averageTimePerJob; 98 | 99 | ui.info.setContent([ 100 | chalk.grey.bold('Elapsed time ') + formatHMS(elapsed), 101 | chalk.grey.bold('Remaining time ') + (estimate ? formatHMS(estimate) : ' ~'), 102 | chalk.grey.bold('Time per job ') + ' ' + (average ? 
formatMS(average) : ' ~'), 103 | chalk.grey.bold('Errors ') 104 | ].concat(errors).join('\n')); 105 | } 106 | 107 | // Branching the logger 108 | spider.use(logger(_.extend({ 109 | out: function(txt) { 110 | var lines = ui.log.lines, 111 | wrapped = ui.log._wrapContent(txt, ui.log.width - 2); 112 | 113 | wrapped.forEach(function(line) { 114 | lines.unshift(line); 115 | }); 116 | 117 | lines = lines.slice(0, ui.log.height - 2); 118 | 119 | ui.log.setContent(lines.reverse().join('\n')); 120 | 121 | } 122 | }, opts.logger))); 123 | 124 | // On end 125 | spider.once('spider:teardown', function() { 126 | clearInterval(renderInterval); 127 | setTimeout(function() { 128 | spider.logger.info('Press Ctrl-c to exit...'); 129 | render(); 130 | }, 10); 131 | }); 132 | 133 | // Progress bar & job table 134 | spider.on('job:start', function(job) { 135 | var j = ui.jobTable.find(job.id); 136 | 137 | var truncatedUrl = formatUrl(job.req.url); 138 | 139 | if (j) 140 | ui.jobTable.remove(job.id); 141 | 142 | ui.jobTable.add(job.id, [ 143 | ' ' + chalk.bgBlue.bold.white(' ~ ') + ' ', 144 | chalk.bold.grey(truncatedUrl), 145 | chalk.bold.white('-') 146 | ]); 147 | 148 | ui.jobTable.update(); 149 | }); 150 | 151 | spider.on('job:success', function(job) { 152 | var rows = ui.jobTable.find(job.id); 153 | 154 | rows[0] = ' ' + chalk.bgGreen.bold.white(' ✓ ') + ' '; 155 | rows[1] = chalk.bold.grey(formatUrl(job.res.url || job.req.url)); 156 | 157 | ui.jobTable.update(); 158 | }); 159 | 160 | spider.on('job:retry', function(job) { 161 | var rows = ui.jobTable.find(job.id); 162 | 163 | rows[0] = ' ' + chalk.bgMagenta.bold.white(' · ') + ' '; 164 | 165 | ui.jobTable.update(); 166 | }); 167 | 168 | spider.on('job:discard', function(err, job) { 169 | var j = ui.jobTable.find(job.id); 170 | 171 | if (j) 172 | ui.jobTable.remove(job.id); 173 | }); 174 | 175 | spider.on('job:fail', function(err, job) { 176 | var rows = ui.jobTable.find(job.id), 177 | errMessage = err.message; 178 | 179 | if 
(errMessage.length > 12) 180 | errMessage = errMessage.slice(0, 9) + '...'; 181 | 182 | rows[0] = ' ' + chalk.bgRed.bold.white(' ✗ ') + ' '; 183 | rows[1] = chalk.bold.grey(formatUrl(job.res.url || job.req.url)); 184 | rows[2] = chalk.red(errMessage); 185 | 186 | ui.jobTable.update(); 187 | }); 188 | 189 | function updateReqRes(err, job) { 190 | if (!job) { 191 | job = err; 192 | err = null; 193 | } 194 | 195 | // Request 196 | var reqText = ''; 197 | reqText += chalk.grey.bold('Url') + ' ' + formatUrl(job.req.url, ui.request.width - 2 - 4) + '\n'; 198 | 199 | _(job.req) 200 | .omit(['url', 'retry', 'retryNow', 'retryLater']) 201 | .forIn(function(v, k) { 202 | reqText += chalk.grey.bold(_.capitalize(k)) + ' ' + util.inspect(v, {depth: 1}) + '\n'; 203 | }) 204 | .value(); 205 | 206 | ui.request.setContent(reqText); 207 | 208 | // Response 209 | var resText = ''; 210 | 211 | resText += chalk.grey.bold('Url') + ' ' + formatUrl(job.res.url || job.req.url, ui.response.width - 2 - 4) + '\n'; 212 | 213 | if (err) 214 | resText += chalk.red.bold('Error') + ' ' + (err.message || err) + '\n'; 215 | 216 | resText += chalk[err ? 'grey' : 'green'].bold('Data') + ' ' + util.inspect(job.res.data, {depth: 1}) + '\n'; 217 | 218 | _(job.res) 219 | .omit(['url', 'body', 'data', 'error']) 220 | .forIn(function(v, k) { 221 | resText += chalk.grey.bold(_.capitalize(k)) + ' ' + util.inspect(v, {depth: 1}) + '\n'; 222 | }) 223 | .value(); 224 | 225 | ui.response.setContent(resText); 226 | }; 227 | 228 | spider.on('job:success', updateReqRes); 229 | spider.on('job:fail', updateReqRes); 230 | 231 | function updateCompletion() { 232 | var completion = spider.stats.completion; 233 | 234 | ui.progressBar.setProgress(completion); 235 | ui.progressBar.setLabel('Progress - ' + completion + '%'); 236 | } 237 | 238 | spider.on('job:add', updateCompletion); 239 | spider.on('job:end', updateCompletion); 240 | }; 241 | --------------------------------------------------------------------------------