├── test ├── mocha.opts ├── copyFile.js ├── jobTimer.js ├── fetch.js ├── config.js ├── Map.js ├── jobPool.js ├── Job.js └── extracter.js ├── benchmarks ├── Makefile ├── run └── server.js ├── index.js ├── .travis.yml ├── config.js ├── .gitignore ├── examples ├── extract.js ├── snapshot.js ├── multi-extract.js ├── multi-snapshot.js ├── snapshot2.js ├── global-viewport-snapshot.js ├── global-zoom-snapshot.js ├── job-zoom-snapshot.js └── job-viewport-snapshot.js ├── Makefile ├── .gitattributes ├── lib ├── copyFile.js ├── fetch.js ├── config.js ├── jobTimer.js ├── jobPool.js ├── Map.js ├── Job.js ├── worker.js ├── bridge.js └── extracter.js ├── package.json └── README.md /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --require should 2 | --timeout 5000 -------------------------------------------------------------------------------- /benchmarks/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | @./run ./server 3 | @echo 4 | 5 | .PHONY: all -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | exports = module.exports = require('./lib/extracter'); 2 | 3 | exports.version = '0.2.5'; -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | - "0.8" 5 | before_script: 6 | - phantomjs --version 7 | after_script: 8 | - make test-coveralls 9 | -------------------------------------------------------------------------------- /benchmarks/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | echo $1 4 | 5 | echo 6 | node $1 & 7 | pid=$! 8 | 9 | sleep 2 10 | 11 | wrk 'http://localhost:3000/snapshot?url=http://www.baidu.com' \ 12 | -d 30 \ 13 | -c 50 \ 14 | -t 8 \ 15 | --timeout 10s \ 16 | 17 | kill $pid -------------------------------------------------------------------------------- /config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | wsPort: 3001, 3 | workerNum: 0, 4 | 5 | maxJob: 10, 6 | maxQueueJob: 0, 7 | viewportSize: { 8 | width: 1024, 9 | height: 600 10 | }, 11 | clipRect: 0, 12 | zoomFactor: 0, 13 | javascriptEnabled: false 14 | }; -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############ 2 | ## Windows 3 | ############ 4 | 5 | # Windows image file caches 6 | Thumbs.db 7 | 8 | # Folder config file 9 | Desktop.ini 10 | 11 | ############# 12 | ## My Project 13 | ############# 14 | 15 | node_modules/ 16 | snapshot/ 17 | examples/snapshot/ 18 | coverage.html -------------------------------------------------------------------------------- /examples/extract.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')(); 4 | 5 | urlExtract.extract('http://www.baidu.com', function (job) { 6 | console.log('This is a extract example.'); 7 | console.log(job); 8 | process.exit(); 9 | }); 10 | })(); -------------------------------------------------------------------------------- /examples/snapshot.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')(); 4 | 5 | urlExtract.snapshot('http://www.baidu.com', function (job) { 6 | console.log('This is a snapshot example.'); 7 | console.log(job); 8 | process.exit(); 9 | }); 10 | })(); -------------------------------------------------------------------------------- /examples/multi-extract.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')() 4 | , i = 1; 5 | 6 | urlExtract.extract(['http://www.baidu.com', 'http://www.qq.com', 'http://www.sina.com'], function (job) { 7 | console.log(job); 8 | if ((i++) === 3) process.exit(); 9 | }); 10 | })(); -------------------------------------------------------------------------------- /examples/multi-snapshot.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')() 4 | , i = 1; 5 | 6 | urlExtract.snapshot(['http://www.baidu.com', 'http://www.qq.com', 'http://www.sina.com'], function (job) { 7 | console.log(job); 8 | if ((i++) === 3) process.exit(); 9 | }); 10 | })(); -------------------------------------------------------------------------------- /examples/snapshot2.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')(); 4 | 5 | // Sometimes, we do not care how long the snapshot generation 6 | urlExtract.extract('http://www.baidu.com', './snapshot/test/test.png'); 7 | 8 | setTimeout(function () { 9 | process.exit(); 10 | }, 10000); 11 | })(); -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | install: 2 | @npm install 3 | 4 | test: 5 | @mocha 6 | 7 | test-cov: 8 | @mocha --require blanket -R html-cov > coverage.html 9 | @echo Please open coverage.html to see the result! 10 | 11 | test-coveralls: 12 | @mocha --require blanket --reporter mocha-lcov-reporter | COVERALLS_REPO_TOKEN="1YMpW0X8cMInhR9glhlTEM8lovs1bY9RV" ./node_modules/coveralls/bin/coveralls.js 13 | 14 | .PHONY: test test-cov test-coveralls -------------------------------------------------------------------------------- /examples/global-viewport-snapshot.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')({ 4 | viewportSize: { 5 | width: 800, 6 | height: 400 7 | }, 8 | clipRect: { 9 | top: 0, 10 | left: 0, 11 | width: 800, 12 | height: 400 13 | } 14 | }); 15 | 16 | urlExtract.snapshot('http://www.baidu.com', function (job) { 17 | console.log('This is a snapshot example.'); 18 | console.log(job); 19 | process.exit(); 20 | }); 21 | })(); -------------------------------------------------------------------------------- /test/copyFile.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , fs = require('fs') 3 | , fileCopy = require('../lib/copyFile'); 4 | 5 | function makeSureFile(file, done) { 6 | if (fs.existsSync(file)) { 7 | fs.unlinkSync(file); 8 | done && done(); 9 | } 10 | } 11 | 12 | describe('config', function () { 13 | it('should able to copy file', function (done) { 14 | fileCopy('./package.json', './package.tmp', function (err) { 15 | if (err) throw err; 16 | makeSureFile('./package.tmp', done); 17 | }); 18 | }); 19 | }); -------------------------------------------------------------------------------- /examples/global-zoom-snapshot.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')({ 4 | viewportSize: { 5 | width: 512, 6 | height: 300 7 | }, 8 | clipRect: { 9 | top: 0, 10 | left: 0, 11 | width: 512, 12 | height: 300 13 | }, 14 | zoomFactor: 0.5 15 | }); 16 | 17 | urlExtract.snapshot('http://www.baidu.com', function (job) { 18 | console.log('This is a snapshot example.'); 19 | console.log(job); 20 | process.exit(); 21 | }); 22 | })(); -------------------------------------------------------------------------------- /examples/job-zoom-snapshot.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')(); 4 | 5 | urlExtract.snapshot('http://www.baidu.com', { 6 | viewportSize: { 7 | width: 512, 8 | height: 300 9 | }, 10 | clipRect: { 11 | top: 0, 12 | left: 0, 13 | width: 512, 14 | height: 300 15 | }, 16 | zoomFactor: 0.5, 17 | callback: function (job) { 18 | console.log('This is a snapshot example.'); 19 | console.log(job); 20 | process.exit(); 21 | } 22 | }); 23 | })(); -------------------------------------------------------------------------------- /examples/job-viewport-snapshot.js: -------------------------------------------------------------------------------- 1 | module.exports = (function () { 2 | "use strict" 3 | var urlExtract = require('../')(); 4 | 5 | urlExtract.snapshot('http://www.baidu.com', { 6 | viewportSize: { 7 | width: 800, 8 | height: 400 9 | }, 10 | clipRect: { 11 | top: 0, 12 | left: 0, 13 | width: 800, 14 | height: 400 15 | }, 16 | quality: 100, 17 | callback: function (job) { 18 | console.log('This is a snapshot example.'); 19 | console.log(job); 20 | process.exit(); 21 | } 22 | }); 23 | 24 | })(); -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | *.sln merge=union 7 | *.csproj merge=union 8 | *.vbproj merge=union 9 | *.fsproj merge=union 10 | *.dbproj merge=union 11 | 12 | # Standard to msysgit 13 | *.doc diff=astextplain 14 | *.DOC diff=astextplain 15 | *.docx diff=astextplain 16 | *.DOCX diff=astextplain 17 | *.dot diff=astextplain 18 | *.DOT diff=astextplain 19 | *.pdf diff=astextplain 20 | *.PDF diff=astextplain 21 | *.rtf diff=astextplain 22 | *.RTF diff=astextplain 23 | -------------------------------------------------------------------------------- /benchmarks/server.js: -------------------------------------------------------------------------------- 1 | var connect = require('connect') 2 | , uExtract = require('../')({ 3 | maxJob: 10 4 | }); 5 | 6 | var app = connect() 7 | .use('/snapshot', function (req, res, next) { 8 | var url = req.url.match(/\?url\=(.+)$/)[1]; 9 | if (url) { 10 | uExtract.snapshot(url, function (job) { 11 | res.writeHead(200, { 12 | 13 | }); 14 | res.end(); 15 | }); 16 | } else { 17 | next(); 18 | } 19 | }) 20 | .listen(3000, function () { 21 | console.log('Listen on port 3000'); 22 | }); -------------------------------------------------------------------------------- /lib/copyFile.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - copyFile 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | "use strict"; 8 | var fs = require('fs'); 9 | 10 | /** 11 | * copyFile 12 | * @param {String} src 13 | * @param {String} dst 14 | * @param {Function} callback 15 | */ 16 | module.exports = function (src, dst, callback) { 17 | var readStream = fs.createReadStream(src) 18 | , writeStream = fs.createWriteStream(dst); 19 | readStream.pipe(writeStream); 20 | readStream.on('end', function () { 21 | writeStream.end(); 22 | callback(null); 23 | }); 24 | readStream.on('error', callback); 25 | writeStream.on('error', callback); 26 | }; -------------------------------------------------------------------------------- /test/jobTimer.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , Job = require('../lib/Job') 3 | , jobTimer = require('../lib/jobTimer'); 4 | 5 | describe('fetch', function () { 6 | it('should able to push job', function () { 7 | var timer = jobTimer(function () {}) 8 | , job1 = new Job('http://localhost/test1') 9 | , job2 = new Job('http://localhost/test2') 10 | timer.push([job1, job2]); 11 | timer.done(job1.id)[0].should.eql(job1); 12 | timer.done(job2.id)[0].should.eql(job2); 13 | timer.destroy(); 14 | }); 15 | 16 | it('should able to timeout', function (done) { 17 | var timer = jobTimer(function (_job) { 18 | _job.should.equal(job); 19 | timer.destroy(); 20 | done(); 21 | }, 30, 20) 22 | , job = new Job('http://localhost/test3'); 23 | timer.push([job]); 24 | }); 25 | }); -------------------------------------------------------------------------------- /test/fetch.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , fetch = require('../lib/fetch.js'); 3 | 4 | describe('fetch', function () { 5 | it('should get the title from html string', function () { 6 | var fetchObj = fetch('test'); 7 | fetchObj.title.should.equal('test'); 8 | fetchObj.description.should.equal('No Description'); 9 | }); 10 | 11 | it('should get the description from html string', function () { 12 | var fetchObj = fetch(''); 13 | fetchObj.title.should.equal('No Title'); 14 | fetchObj.description.should.equal('Hello world'); 15 | }); 16 | 17 | it('should get the title as "No Title", when the title is null', function () { 18 | var fetchObj = fetch(''); 19 | fetchObj.title.should.equal('No Title'); 20 | }); 21 | }); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "url-extract", 3 | "version": "0.2.6", 4 | "description": "Snapshot & extract url library", 5 | "keywords": [ 6 | "snapshot", 7 | "screenshot", 8 | "extract", 9 | "url", 10 | "library", 11 | "phantomjs" 12 | ], 13 | "homepage": "http://miniflycn.github.io/url-extract", 14 | "repository": "git://github.com/miniflycn/url-extract.git", 15 | "author": "Daniel Yang ", 16 | "main": "index", 17 | "engines": { 18 | "node": ">= 0.8.0" 19 | }, 20 | "license": "MIT", 21 | "dependencies": { 22 | "uid2": "0.0.3", 23 | "node-websocket-server": "1.1.4" 24 | }, 25 | "devDependencies": { 26 | "mocha": "*", 27 | "should": "*", 28 | "connect": "*", 29 | "blanket": "1.1.5", 30 | "coveralls": "*", 31 | "mocha-lcov-reporter": "*", 32 | "debug": "*" 33 | }, 34 | "scripts": { 35 | "test": "mocha", 36 | "blanket": { 37 | "pattern": "url-extract/lib" 38 | } 39 | }, 40 | "config": { 41 | "blanket": { 42 | "pattern": "url-extract/lib" 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /lib/fetch.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - fetch 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = (function () { 8 | "use strict"; 9 | 10 | /** 11 | * fetch 12 | * @param {String} html 13 | * @return {Object} data 14 | */ 15 | return function (html) { 16 | if (!html) return { title: false, description: false }; 17 | 18 | var title = html.match(/\(.*?)\<\/title\>/) 19 | , meta = html.match(/\/g) 20 | , description; 21 | 22 | if (meta) { 23 | for (var i = meta.length; i--;) { 24 | if (~meta[i].indexOf('name="description"') || ~meta[i].indexOf('name="Description"')){ 25 | description = meta[i].match(/content\=\"(.*?)\"/)[1]; 26 | } 27 | } 28 | } 29 | 30 | (title && title[1] !== '') ? (title = title[1]) : (title = 'No Title'); 31 | description || (description = 'No Description'); 32 | 33 | return { 34 | title: title, 35 | description: description 36 | }; 37 | }; 38 | 39 | })(); -------------------------------------------------------------------------------- /lib/config.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - config 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = (function () { 8 | "use strict"; 9 | var config = require('../config') 10 | , emitter = (typeof process === 'object') && new (require('events').EventEmitter)() 11 | , isChange = false; 12 | 13 | /** 14 | * get 15 | * @return {Object} 16 | */ 17 | function get() { 18 | return config; 19 | } 20 | 21 | /** 22 | * set 23 | * @param {Object} opts 24 | * @return {Object} 25 | */ 26 | function set(opts) { 27 | var opt; 28 | for (opt in opts) { 29 | if (opt in config) config[opt] = opts[opt]; 30 | } 31 | emitter && emitter.emit('set', opts); 32 | isChange || (isChange = true); 33 | return config; 34 | } 35 | 36 | /** 37 | * changed 38 | * @return {Boolean} 39 | */ 40 | function changed(boolean) { 41 | if (!arguments.length) return isChange; 42 | return (isChange = boolean); 43 | } 44 | 45 | return { 46 | get: get, 47 | set: set, 48 | on: emitter && emitter.on.bind(emitter), 49 | off: emitter && emitter.removeListener.bind(emitter), 50 | changed: changed 51 | }; 52 | 53 | })(); -------------------------------------------------------------------------------- /test/config.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , config = require('../lib/config') 3 | , configObj = require('../config'); 4 | 5 | describe('config', function () { 6 | it('should able to tell changed or not', function () { 7 | var zoomFactor = config.get().zoomFactor; 8 | config.changed(false).should.be.false; 9 | config.set({ 10 | zoomFactor: 0.5 11 | }); 12 | config.changed().should.be.true; 13 | config.set({ 14 | zoomFactor: zoomFactor 15 | }); 16 | }); 17 | 18 | it('should able to get config', function () { 19 | config.get().should.equal(configObj); 20 | }); 21 | 22 | it('should not able to set a unavailable param', function (done) { 23 | function onSet() { 24 | config.off('set', onSet); 25 | done(); 26 | } 27 | config.on('set', onSet); 28 | config.set({ 29 | xxxx: true 30 | }); 31 | assert.equal(config.get().xxxx, undefined); 32 | }); 33 | 34 | it('should able to set a param', function () { 35 | var zoomFactor = config.get().zoomFactor; 36 | config.set({ 37 | zoomFactor: 0.5 38 | }); 39 | config.get().zoomFactor.should.equal(0.5); 40 | config.set({ 41 | zoomFactor: zoomFactor 42 | }); 43 | config.get().zoomFactor.should.equal(zoomFactor); 44 | }); 45 | }); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![build status](https://secure.travis-ci.org/miniflycn/url-extract.png)](http://travis-ci.org/miniflycn/url-extract) 2 | [![Coverage Status](https://coveralls.io/repos/miniflycn/url-extract/badge.png?branch=master)](https://coveralls.io/r/miniflycn/url-extract?branch=master) 3 | # url-extract 4 | [![NPM](https://nodei.co/npm/url-extract.png)](https://npmjs.org/package/url-extract) 5 | 6 | Homepage(主页): http://miniflycn.github.io/url-extract 7 | 8 | ## Contributors 9 | https://github.com/miniflycn/url-extract/graphs/contributors 10 | 11 | 12 | ## License 13 | (The MIT License) 14 | 15 | Copyright (c) 2013 Daniel Yang 16 | 17 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /lib/jobTimer.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - jobTimer 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = function (fail, timeout, cycleTime) { 8 | "use strict"; 9 | var jobPool = require('./jobPool'); 10 | 11 | var _jobList = [] 12 | , _timer 13 | , timeout = timeout || 60000 14 | , cycleTime = cycleTime || 10000; 15 | 16 | /** 17 | * push 18 | * @param {jobList} jobList 19 | */ 20 | function push(jobList) { 21 | var len = jobList.length 22 | , i = 0 23 | , job; 24 | for (; i < len; i++) { 25 | job = jobList[i]; 26 | job.setTime(); 27 | _jobList.push(job); 28 | } 29 | } 30 | 31 | /** 32 | * check 33 | */ 34 | function check() { 35 | // 1 min 36 | if (_jobList.length) { 37 | var job = _jobList.shift(); 38 | if (((new Date()) - job.getTime()) > timeout) { 39 | fail(job); 40 | jobPool.remove(job.id); 41 | return check(); 42 | } else { 43 | _jobList.unshift(job); 44 | } 45 | } 46 | _timer = setTimeout(function () { 47 | check(); 48 | }, cycleTime); 49 | } 50 | 51 | /** 52 | * done 53 | * @param {String} jobId 54 | */ 55 | function done(jobId) { 56 | for (var i = _jobList.length; i--;) { 57 | if (_jobList[i].id === jobId) { 58 | return _jobList.splice(i, 1); 59 | } 60 | } 61 | } 62 | 63 | /** 64 | * destroy 65 | */ 66 | function destroy() { 67 | clearTimeout(_timer); 68 | _jobList = []; 69 | } 70 | 71 | check(); 72 | 73 | return { 74 | push: push, 75 | check: check, 76 | done: done, 77 | destroy: destroy 78 | }; 79 | }; -------------------------------------------------------------------------------- /test/Map.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , Map = require('../lib/Map'); 3 | 4 | describe('Map', function () { 5 | it('should able to set & get', function () { 6 | var map = new Map(); 7 | assert.deepEqual(map.get('test'), null); 8 | map.set('test', 'text'); 9 | map.get('test').should.equal('text'); 10 | }); 11 | 12 | it('should able to get the item number', function () { 13 | var map = new Map(); 14 | map.length.should.equal(0); 15 | map.set('test', 'text'); 16 | map.length.should.equal(1); 17 | }); 18 | 19 | it('should able to judge key exists or not', function () { 20 | var map = new Map(); 21 | map.has('test').should.be.false; 22 | map.set('test', 'text'); 23 | }); 24 | 25 | it('should able to get a value randomly', function () { 26 | var map = new Map(); 27 | map.set('test', 'text'); 28 | map.get().should.equal('text'); 29 | }); 30 | 31 | it('should able to get a item by function', function () { 32 | var map = new Map(); 33 | map.set('test', 'text'); 34 | map.get(function (value) { 35 | if (value === 'text') { 36 | return true; 37 | } 38 | return false; 39 | }).should.equal('text'); 40 | assert.deepEqual(map.get(function (value) { return false; }), null); 41 | }); 42 | 43 | it('should able to remove a key', function () { 44 | var map = new Map(); 45 | map.set('test', 'text'); 46 | map.remove('test'); 47 | map.has('test').should.be.false; 48 | assert.deepEqual(map.remove('test'), null); 49 | }); 50 | 51 | it('should able to do with each item', function () { 52 | var map = new Map(); 53 | map.set('test', 'text'); 54 | map.each(function (value) { 55 | value.should.equal('text'); 56 | }); 57 | }); 58 | 59 | it('should able to clear', function () { 60 | var map = new Map(); 61 | map.set('test', 'text'); 62 | map.length.should.equal(1); 63 | map.clear(); 64 | map.length.should.equal(0); 65 | }); 66 | 67 | it('should able to judge if it contains a value', function () { 68 | var map = new Map(); 69 | map.set('test1', 'text1'); 70 | map.set('test2', false); 71 | map.set('test3', 0); 72 | map.set('test4', undefined); 73 | map.contains('text1').should.be.true; 74 | map.contains('test1').should.be.false; 75 | map.contains(false).should.be.true; 76 | map.contains(0).should.be.true; 77 | map.contains(undefined).should.be.true; 78 | }); 79 | }); -------------------------------------------------------------------------------- /test/jobPool.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , Job = require('../lib/Job') 3 | , jobPool = require('../lib/jobPool'); 4 | 5 | describe('jobPool', function () { 6 | it('should able to cache job', function () { 7 | var job1 = new Job('http://localhost/test1') 8 | , job2 = new Job('http://localhost/test2') 9 | , job3 = new Job('http://localhost/test3') 10 | , jobList; 11 | jobPool.push(job1); 12 | jobPool.push(job2); 13 | jobPool.push(job3); 14 | jobList = jobPool.shift(3); 15 | jobList.length.should.equal(3); 16 | }); 17 | 18 | it('should able to get a job', function () { 19 | var job = new Job('http://localhost/test4') 20 | , id = job.id; 21 | jobPool.push(job); 22 | jobPool.shift(1); 23 | jobPool.get(job.id).should.equal(job); 24 | }); 25 | 26 | it('should able to push some jobs at a time', function () { 27 | var job1 = new Job('http://localhost/test5') 28 | , job2 = new Job('http://localhost/test6') 29 | , jobList; 30 | jobPool.push([job1, job2]); 31 | jobList = jobPool.shift(5); 32 | jobList.length.should.equal(2); 33 | }); 34 | 35 | it('should able to count jobs', function () { 36 | var job1 = new Job('http://localhost/test7') 37 | , job2 = new Job('http://localhost/test8') 38 | , job3 = new Job('http://localhost/test9') 39 | , job4 = new Job('http://localhost/test10'); 40 | jobPool.push([job1, job2, job3, job4]); 41 | jobPool.count().should.equal(4); 42 | jobPool.shift(2); 43 | jobPool.shift(2); 44 | }); 45 | 46 | it('should not get a unknow job', function () { 47 | assert.deepEqual(jobPool.get(1234567), null); 48 | }); 49 | 50 | it('should able to remove job', function () { 51 | var job = new Job('http://localhost/test11'); 52 | jobPool.push(job); 53 | jobPool.remove(job.id).should.equal(job); 54 | }); 55 | 56 | it('should able to unshift job', function () { 57 | var job = new Job('http://localhost/test12'); 58 | jobPool.unshift(job); 59 | jobPool.shift(1)[0].should.equal(job); 60 | }); 61 | 62 | it('should able to unshift some job', function () { 63 | var job1 = new Job('http://localhost/test13') 64 | , job2 = new Job('http://localhost/test14') 65 | , job3 = new Job('http://localhost/test15'); 66 | jobPool.unshift([job1, job2, job3]); 67 | jobPool.count().should.equal(3); 68 | jobPool.shift(3); 69 | }); 70 | 71 | it('should able to make a job in the top of stack', function () { 72 | var job1 = new Job('http://localhost/test16') 73 | , job2 = new Job('http://localhost/test17') 74 | , job3 = new Job('http://localhost/test18'); 75 | jobPool.push(job1); 76 | jobPool.push(job2); 77 | jobPool.push(job3); 78 | jobPool.quick(job3.id); 79 | jobPool.shift(1)[0].should.equal(job3); 80 | jobPool.shift(2); 81 | }); 82 | }); -------------------------------------------------------------------------------- /lib/jobPool.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - jobPool 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = (function () { 8 | "use strict"; 9 | var isArray = require('util').isArray; 10 | 11 | var _pool = {} 12 | , _stack = []; 13 | 14 | function _push(job) { 15 | _stack.push(job); 16 | _pool[job.id] = job; 17 | } 18 | 19 | function _unshift(job) { 20 | _stack.unshift(job); 21 | _pool[job.id] = job; 22 | } 23 | 24 | /** 25 | * push(job) 26 | * push(jobList) 27 | * @param {Job} job 28 | * @param {Array} jobList 29 | */ 30 | function push(job) { 31 | if (!isArray(job)) { 32 | return _push(job); 33 | } else { 34 | for (var i = job.length; i--;) { 35 | _push(job[i]); 36 | } 37 | } 38 | } 39 | 40 | /** 41 | * unshift(job) 42 | * unshift(jobList) 43 | * @param {Job} job 44 | * @param {Array} jobList 45 | */ 46 | function unshift(job) { 47 | if (!isArray(job)) { 48 | _unshift(job); 49 | } else { 50 | for (var i = job.length; i--;) { 51 | _unshift(job[i]); 52 | } 53 | } 54 | } 55 | 56 | /** 57 | * shift 58 | * @param {Number} num 59 | */ 60 | function shift(num) { 61 | var len = _stack.length 62 | , list; 63 | if (num < len) { 64 | list = _stack.splice(0, num); 65 | } else { 66 | list = _stack; 67 | _stack = []; 68 | } 69 | return list; 70 | } 71 | 72 | /** 73 | * get 74 | * @param {String} id 75 | */ 76 | function get(id) { 77 | var job = _pool[id]; 78 | if (job) { 79 | _pool[id] = null; 80 | delete _pool[id]; 81 | return job; 82 | } else { 83 | return null; 84 | } 85 | } 86 | 87 | /** 88 | * remove 89 | * @param {String} id 90 | */ 91 | function remove(id) { 92 | var job = _pool[id]; 93 | if (job) { 94 | for (var i = _stack.length; i--;) { 95 | if (_stack[i] === job) { 96 | _stack.splice(i, 1); 97 | break; 98 | } 99 | } 100 | return get(id); 101 | } else { 102 | return null; 103 | } 104 | } 105 | 106 | /** 107 | * count 108 | * @return {Number} 109 | */ 110 | function count() { 111 | return _stack.length; 112 | } 113 | 114 | /** 115 | * quick 116 | * @param {String} id 117 | */ 118 | function quick(id) { 119 | var job = remove(id); 120 | if (job) { 121 | return unshift(job); 122 | } 123 | } 124 | 125 | return { 126 | push: push, 127 | shift: shift, 128 | unshift: unshift, 129 | get: get, 130 | remove: remove, 131 | count: count, 132 | quick: quick 133 | }; 134 | 135 | })(); -------------------------------------------------------------------------------- /lib/Map.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - Map 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = (function () { 8 | "use strict"; 9 | 10 | /** 11 | * Map 12 | * @class 13 | */ 14 | function Map() { 15 | this.map = {}; 16 | this._len = 0; 17 | } 18 | Map.prototype = { 19 | constructor: Map, 20 | /** 21 | * set 22 | * @param {String} key 23 | * @param {Any} value 24 | * @return {value or Boolean} 25 | */ 26 | set: function (key, value) { 27 | return (!this.has() && ++this._len && (this.map[key] = value)); 28 | }, 29 | /** 30 | * has 31 | * @param {String} key 32 | * @return {Boolean} 33 | */ 34 | has: function (key) { 35 | return (key in this.map); 36 | }, 37 | /** 38 | * get(key) 39 | * get(judge) 40 | * @param {String} key 41 | * @param {Function} judge 42 | * @return {value or null} 43 | */ 44 | get: function (key) { 45 | var map = this.map 46 | , tmp 47 | , value; 48 | key = key || function () { return true; }; 49 | if (typeof key === 'string') { 50 | return (map[key] || null); 51 | } else if (typeof key === 'function') { 52 | for (tmp in map) { 53 | value = map[tmp]; 54 | if (key(value)) return value; 55 | } 56 | } 57 | return null; 58 | }, 59 | /** 60 | * contains 61 | * @param {Unless null} value 62 | * @return {Boolean} 63 | */ 64 | contains: function (value) { 65 | return !(this.get(function (_value) { 66 | if (value === _value) return true; 67 | return false; 68 | }) === null); 69 | }, 70 | /** 71 | * remove 72 | * @param {String} key 73 | * @return {value or null} 74 | */ 75 | remove: function (key) { 76 | var value; 77 | if (value = this.get(key)) { 78 | this.map[key] = null; 79 | delete this.map[key]; 80 | --this._len; 81 | return value; 82 | } else { 83 | return null; 84 | } 85 | }, 86 | /** 87 | * each 88 | * @param {Function} foo 89 | */ 90 | each: function (foo) { 91 | var map = this.map 92 | , key 93 | , value; 94 | for (key in map) { 95 | if (!foo(map[key])) break; 96 | } 97 | }, 98 | /** 99 | * clear 100 | */ 101 | clear: function () { 102 | this.map = {}; 103 | this._len = 0; 104 | } 105 | } 106 | 107 | // define length 108 | Object.defineProperty(Map.prototype, 'length', { 109 | get: function() { 110 | return this._len; 111 | } 112 | }); 113 | 114 | return Map; 115 | })(); -------------------------------------------------------------------------------- /lib/Job.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - Job 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = (function () { 8 | "use strict"; 9 | var uid = require('uid2'); 10 | 11 | /** 12 | * Job 13 | * @class 14 | * @param {String} url 15 | * @param {Boolean} content 16 | * @param {Object} opt 17 | * @option {String} id 18 | * @option {String} groupId 19 | * @option {Function} callback 20 | * @option {Boolean} ignoreCache 21 | * @option {String} image 22 | * @option {Object} viewportSize 23 | * @option {Object} clipRect 24 | * @option {Number} zoomFactor 25 | */ 26 | function Job(url, opt, content) { 27 | this.url = url; 28 | this.content = ((typeof opt === 'boolean') ? opt : content) || false; 29 | opt = ((typeof opt === 'boolean') ? {} : opt) || {}; 30 | this.id = opt.id || uid(10); 31 | this.groupId = opt.groupId || null; 32 | this.callback = opt.callback; 33 | this.image = opt.image; 34 | opt.viewportSize && (this.viewportSize = opt.viewportSize); 35 | opt.clipRect && (this.clipRect = opt.clipRect); 36 | opt.zoomFactor && (this.zoomFactor = opt.zoomFactor); 37 | opt.javascriptEnabled && (this.javascriptEnabled = opt.javascriptEnabled); 38 | opt.quality ? this.quality = opt.quality : this.quality = -1; 39 | } 40 | Job.prototype = { 41 | constructor: Job, 42 | /** 43 | * setData 44 | * @param {Object} data 45 | */ 46 | setData: function (data) { 47 | if (data.status) { 48 | this.status = true; 49 | if (this.content) { 50 | this.title = data.title; 51 | this.description = data.description; 52 | this.image = data.image; 53 | } else { 54 | this.image = data.image; 55 | } 56 | } else { 57 | this.status = false; 58 | } 59 | }, 60 | 61 | /** 62 | * getData 63 | * @return {Object} 64 | */ 65 | getData: function () { 66 | if (this.content) { 67 | return { 68 | title: this.title, 69 | description: this.description, 70 | image: this.image, 71 | status: this.status 72 | }; 73 | } else { 74 | return { 75 | image: this.image, 76 | status: this.status 77 | } 78 | } 79 | }, 80 | /** 81 | * setTime 82 | * @return {Date} 83 | */ 84 | setTime: function () { 85 | return (this.time = +(new Date())); 86 | }, 87 | /** 88 | * getTime 89 | * @return {Date} 90 | */ 91 | getTime: function () { 92 | return this.time; 93 | }, 94 | /** 95 | * fail 96 | * @return {Job} 97 | */ 98 | fail: function () { 99 | this.status = false; 100 | return this; 101 | } 102 | 103 | }; 104 | 105 | return Job; 106 | 107 | })(); -------------------------------------------------------------------------------- /test/Job.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , Job = require('../lib/Job'); 3 | 4 | describe('Job', function () { 5 | it('should able to create a extract url job with group id', function () { 6 | var job = new Job('http://localhost/test', { groupId:'test' }, true); 7 | job.groupId.should.equal('test'); 8 | job.url.should.equal('http://localhost/test'); 9 | job.content.should.be.true; 10 | }); 11 | 12 | it('should able to create a extract url job without id', function () { 13 | var job = new Job('http://localhost/test', true); 14 | job.url.should.equal('http://localhost/test'); 15 | job.content.should.be.true; 16 | }); 17 | 18 | it('should able to create a snapshot job with id', function () { 19 | var job = new Job('http://localhost/test', { groupId:'test' }, false); 20 | job.groupId.should.equal('test'); 21 | job.url.should.equal('http://localhost/test'); 22 | job.content.should.be.false; 23 | }); 24 | 25 | it('should able to create a snapshot job without id', function () { 26 | var job = new Job('http://localhost/test', false); 27 | job.url.should.equal('http://localhost/test'); 28 | job.content.should.be.false; 29 | }); 30 | 31 | it('should able to set data & get data in a extract job', function () { 32 | var data = { 33 | title: 'title', 34 | description: 'description', 35 | image: 'http://localhost/test.png', 36 | status: true 37 | }; 38 | var job = new Job('http://localhost/test', true); 39 | job.setData(data); 40 | job.getData().should.eql({ 41 | title: 'title', 42 | description: 'description', 43 | image: 'http://localhost/test.png', 44 | status: true 45 | }); 46 | }); 47 | 48 | it('should able to set data & get data in a snapshot job', function () { 49 | var data = { 50 | image: 'http://localhost/test.png', 51 | status: true 52 | }; 53 | var job = new Job('http://localhost/test', false); 54 | job.setData(data); 55 | job.getData().should.eql({ 56 | image: 'http://localhost/test.png', 57 | status: true 58 | }); 59 | }); 60 | 61 | it('should able to set status is unavailable', function () { 62 | var data = { 63 | status: false 64 | }; 65 | var job = new Job('http://localhost/test'); 66 | job.setData(data); 67 | job.status.should.be.false; 68 | }); 69 | 70 | it('should able to set callback', function () { 71 | var callback = function () {}; 72 | var job1 = new Job('http://localhost/test', { callback : callback}, false) 73 | , job2 = new Job('http://localhost/test', { callback : callback}, true); 74 | job1.callback.should.equal(callback); 75 | job2.callback.should.equal(callback); 76 | }); 77 | 78 | it('should able to set id', function () { 79 | var job = new Job('http://localhost/test', { id: 'test' }); 80 | job.id.should.equal('test'); 81 | }); 82 | 83 | it('should able to set image path', function () { 84 | var job = new Job('http://localhost/test', { image: './snapshot/test.png' }); 85 | job.image.should.equal('./snapshot/test.png'); 86 | }); 87 | 88 | it('should able to get and set time', function () { 89 | var job = new Job('http://localhost/test'); 90 | job.setTime().should.equal(job.getTime()); 91 | }); 92 | 93 | it('should able to set the status', function () { 94 | var job = new Job('http://localhost/test'); 95 | job.fail().status.should.be.false; 96 | }); 97 | }); -------------------------------------------------------------------------------- /test/extracter.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert') 2 | , fs = require('fs') 3 | , connect = require('connect') 4 | , extracter = require('../lib/extracter')() 5 | , bridge = require('../lib/bridge'); 6 | 7 | var testSever = connect().use('/test', function (req, res, next) { 8 | res.end('test') 9 | }).listen(7777); 10 | 11 | var _image; 12 | 13 | function makeSureImage(image, done) { 14 | if (fs.existsSync(image)) { 15 | fs.unlinkSync(image); 16 | fs.rmdirSync(image.slice(0, image.lastIndexOf('/') + 1)); 17 | done && done(); 18 | } else { 19 | throw new Error('Image is not existed'); 20 | } 21 | } 22 | 23 | describe('extracter', function () { 24 | it('should able to create a extract url job', function (done) { 25 | var _job; 26 | extracter.bind(function (job) { 27 | job.id.should.equal(_job.id); 28 | job.title.should.equal('test'); 29 | job.description.should.equal('Just a test.'); 30 | makeSureImage(job.image, done); 31 | }); 32 | _job = extracter.extract('http://localhost:7777/test/1'); 33 | }); 34 | 35 | it('should able to create a snapshot url job', function (done) { 36 | var _job; 37 | extracter.bind(function (job) { 38 | job.id.should.equal(_job.id); 39 | job.content.should.be.false; 40 | makeSureImage(job.image, done); 41 | extracter.bind(); 42 | }); 43 | _job = extracter.snapshot('http://localhost:7777/test/2'); 44 | }); 45 | 46 | it('should able to set a callback for a extract url job', function (done) { 47 | var _job = extracter.extract('http://localhost:7777/test/3', function (job) { 48 | job.id.should.equal(_job.id); 49 | job.content.should.be.true; 50 | _image = job.image; 51 | makeSureImage(job.image, done); 52 | }); 53 | }); 54 | 55 | it('should able to set a callback for a snapshot url job', function (done) { 56 | var _job = extracter.snapshot('http://localhost:7777/test/4', function (job) { 57 | job.id.should.equal(_job.id); 58 | job.content.should.be.false; 59 | _image = job.image; 60 | makeSureImage(job.image, done); 61 | }); 62 | }); 63 | 64 | it('should able to extract more than one url at a time', function (done) { 65 | var num = 1; 66 | extracter.extract(['http://localhost:7777/test/1', 'http://localhost:7777/test/3'], function (job) { 67 | if ((num++) === 2) return done(); 68 | }); 69 | }); 70 | 71 | it('should able to snapshot more than one url at a time', function (done) { 72 | var num = 1; 73 | extracter.snapshot(['http://localhost:7777/test/2', 'http://localhost:7777/test/4'], function (job) { 74 | if ((num++) === 2) return done(); 75 | }); 76 | }); 77 | 78 | it('should able to extract more than one url at a time with groupId', function (done) { 79 | var num = 1; 80 | extracter.extract(['http://localhost:7777/test/1', 'http://localhost:7777/test/3'], { 81 | groupId: 'test1', 82 | callback: function (job) { 83 | job.groupId.should.equal('test1'); 84 | if ((num++) === 2) return done(); 85 | } 86 | }); 87 | }); 88 | 89 | it('should able to snapshot more than one url at a time with groupId', function (done) { 90 | var num = 1; 91 | extracter.snapshot(['http://localhost:7777/test/2', 'http://localhost:7777/test/4'], { 92 | groupId: 'test2', 93 | callback: function (job) { 94 | job.groupId.should.equal('test2'); 95 | if ((num++) === 2) return done(); 96 | } 97 | }); 98 | }); 99 | 100 | it('should able to save the snapshot in a specified the path', function (done) { 101 | extracter.snapshot('http://localhost:7777/test/5', { 102 | image: './snapshot/test/test.png', 103 | callback: function (job) { 104 | makeSureImage('./snapshot/test/test.png', done); 105 | } 106 | }); 107 | }); 108 | 109 | it('should able to set maxJob param & reset the free worker', function (done) { 110 | function onGet(connectionId, num) { 111 | (num === 50) && bridge.off('get', onGet); 112 | extracter.reset(1, function () { 113 | done(); 114 | }); 115 | } 116 | bridge.on('get', onGet); 117 | extracter.opt({ 118 | maxJob: 50 119 | }); 120 | }); 121 | 122 | it('should able to make sure url is valid or not', function (done) { 123 | var _job = extracter.snapshot('localhost:7777/test/6', function (job) { 124 | job.status.should.be.false; 125 | done(); 126 | }); 127 | _job.should.be.false; 128 | }); 129 | 130 | it('should throw a error when try to initializ url-extract more than one time', function () { 131 | (function () { 132 | require('../lib/extracter')({ 133 | maxJob: 50 134 | }); 135 | }).should.throwError(/^Sorry.*/); 136 | }); 137 | 138 | it('should able to get url-extract module', function () { 139 | require('../lib/extracter')().should.equal(extracter); 140 | }); 141 | 142 | }); -------------------------------------------------------------------------------- /lib/worker.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - worker 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | var webpage = require('webpage') 8 | , args = require('system').args 9 | , write = require('system').stdout.write 10 | , fetch = require('./fetch') 11 | , id = args[1] 12 | , currentNum = 0 13 | , now = 0 14 | , canGetJobs = false 15 | , socketMan 16 | , configMod = require('./config') 17 | , config = configMod.get(); 18 | 19 | function _check() { 20 | var freeNum = (++now + config.maxJob) - currentNum; 21 | if (freeNum > config.maxJob / 4 && canGetJobs) { 22 | canGetJobs = false; 23 | socketMan.get(freeNum); 24 | } 25 | } 26 | 27 | /** 28 | * doJob 29 | * @param {String} jobId 30 | * @param {String} url 31 | * @imagePath {String} imagePath 32 | * @content {Boolean} content, whether it needs to fetch content 33 | */ 34 | function doJob(job) { 35 | var page = webpage.create() 36 | , jobId = job.id 37 | , url = job.url 38 | , imagePath = job.image 39 | , content = job.content 40 | , viewportSize = job.viewportSize || config.viewportSize 41 | , clipRect = job.clipRect || config.clipRect 42 | , zoomFactor = job.zoomFactor || config.zoomFactor 43 | , send 44 | , begin 45 | , save 46 | , end; 47 | viewportSize && (page.viewportSize = viewportSize); 48 | clipRect && (page.clipRect = clipRect); 49 | zoomFactor && (page.zoomFactor = zoomFactor); 50 | page.settings = { 51 | javascriptEnabled: job.javascriptEnabled || config.javascriptEnabled, 52 | loadImages: true, 53 | userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/1.9.0' 54 | }; 55 | page.open(url, function (status) { 56 | var data 57 | , fetchObj; 58 | if (status === 'fail') { 59 | data = { 60 | id: jobId, 61 | url: url, 62 | status: false 63 | }; 64 | // release the memory 65 | page.close(); 66 | // send data to NodeJS 67 | write('{{begin}}' + JSON.stringify(data) + '{{end}}'); 68 | //(++now === currentNum) && (setTimeout(function () { socketMan.get(config.maxJob) }, 500)); 69 | _check(); 70 | 71 | } else { 72 | setTimeout(function () { 73 | page.render(imagePath, {quality: job.quality}); 74 | if (content) { 75 | fetchObj = fetch(page.content); 76 | data = { 77 | jobId: jobId, 78 | url: url, 79 | title: fetchObj.title, 80 | description: fetchObj.description, 81 | image: imagePath, 82 | status: true 83 | }; 84 | } else { 85 | data = { 86 | jobId: jobId, 87 | url: url, 88 | image: imagePath, 89 | status: true 90 | }; 91 | } 92 | // release the memory 93 | page.close(); 94 | // send data to NodeJS 95 | write('{{begin}}' + JSON.stringify(data) + '{{end}}'); 96 | //(++now === currentNum) && (setTimeout(function () { socketMan.get(config.maxJob) }, 500)); 97 | 98 | _check(); 99 | 100 | }, 200); 101 | } 102 | }); 103 | } 104 | 105 | /** 106 | * begin 107 | * @param {jobList} jobs 108 | */ 109 | function begin(jobs) { 110 | var i = jobs.length 111 | , job 112 | , imagePath; 113 | if (i) { 114 | for (; i--;) { 115 | job = jobs[i]; 116 | job.image = job.image || './snapshot/' + id + '/' + job.id + '.png'; 117 | doJob(job); 118 | currentNum++; 119 | } 120 | canGetJobs = true; 121 | } 122 | } 123 | 124 | /** 125 | * socketMan 126 | * @static 127 | * @class 128 | */ 129 | socketMan = { 130 | websocket: undefined, 131 | shutdown: false, 132 | /** 133 | * createWs 134 | * connect websocket server 135 | */ 136 | createWs: function () { 137 | // unfortunately websocket will cost more than 1s to open 138 | var websocket = new WebSocket('ws://localhost:' + config.wsPort + '/') 139 | , that = this; 140 | that.websocket = websocket; 141 | websocket.onopen = function (evt) { 142 | that.shutdown = false; 143 | that.get(config.maxJob); 144 | }; 145 | websocket.onmessage = function (evt) { 146 | that.onMessage(evt); 147 | }; 148 | websocket.onerror = function (msg) {}; 149 | websocket.onclose = function () { 150 | if (!that.shutdown) { 151 | that.shutdown = true; 152 | setTimeout(function () { 153 | that.createWs(); 154 | }, 500); 155 | } else { 156 | phantom.exit(); 157 | } 158 | }; 159 | }, 160 | /** 161 | * onMessage 162 | * @param {Event} evt 163 | */ 164 | onMessage: function (evt) { 165 | var data = JSON.parse(evt.data); 166 | if (data.method === 'POST') { 167 | begin(data.jobList); 168 | } else if (data.method === 'CLOSE') { 169 | phantom.exit(); 170 | } else if (data.method === 'CONFIG') { 171 | configMod.set(data.opts); 172 | if (config.maxJob) return this.get(config.maxJob); 173 | } 174 | }, 175 | /** 176 | * get 177 | * @param {Number} num, the number of jobs worker want to get 178 | */ 179 | get: function (num) { 180 | this.websocket.send(JSON.stringify({ 181 | method: 'GET', 182 | num: num, 183 | id: id 184 | })); 185 | } 186 | }; 187 | socketMan.createWs(); -------------------------------------------------------------------------------- /lib/bridge.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - bridge 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | module.exports = (function () { 8 | "use strict"; 9 | var ws = require('node-websocket-server') 10 | , spawn = require('child_process').spawn 11 | , emitter = new (require('events').EventEmitter)() 12 | , configMod = require('./config') 13 | , config = configMod.get() 14 | , websocketMan 15 | , workerMan 16 | , _connectionId2WorkerId = {}; 17 | 18 | configMod.on('set', function (opts) { 19 | var msg = { 20 | method: 'CONFIG', 21 | opts: opts 22 | }; 23 | websocketMan.broadcast(msg); 24 | }); 25 | /** 26 | * websocketMan 27 | * @static 28 | * @class 29 | */ 30 | websocketMan = { 31 | websocket: undefined, 32 | /** 33 | * init 34 | */ 35 | init: function () { 36 | var that = this; 37 | this.websocket = ws.createServer(); 38 | this.websocket.addListener('connection', function (connection) { 39 | connection.addListener('message', function (msg) { 40 | msg = JSON.parse(msg); 41 | !(_connectionId2WorkerId[connection.id]) && 42 | (_connectionId2WorkerId[connection.id] = msg.id) && 43 | configMod.changed() && 44 | that.send(connection.id, { 45 | method: 'CONFIG', 46 | opts: config 47 | }); 48 | return emitter.emit('get', connection.id, msg.num); 49 | }); 50 | connection.addListener('close', function () { 51 | emitter.emit(connection.id + 'Close'); 52 | // connection close 53 | }); 54 | }); 55 | this.websocket.listen(config.wsPort); 56 | }, 57 | /** 58 | * send 59 | */ 60 | send: function (connectionId, msg) { 61 | this.websocket.send(connectionId, JSON.stringify(msg)); 62 | }, 63 | /** 64 | * broadcast 65 | */ 66 | broadcast: function (msg) { 67 | this.websocket.broadcast(JSON.stringify(msg)); 68 | } 69 | }; 70 | 71 | /** 72 | * websocketMan 73 | * @static 74 | * @class 75 | */ 76 | workerMan = { 77 | sub: 'w', 78 | mid: 0, 79 | /** 80 | * init 81 | */ 82 | init: function (num) { 83 | for (; num--;) { 84 | this.createWorker(); 85 | } 86 | }, 87 | /** 88 | * getId 89 | */ 90 | getId: function () { 91 | return this.sub + (this.mid++) + Math.round(Math.random() * 10); 92 | }, 93 | /** 94 | * createWorker 95 | * create a worker and handle its event 96 | */ 97 | createWorker: function () { 98 | var id = this.getId() 99 | , worker = spawn('phantomjs', [__dirname + '/worker.js', id, '--disk-cache=true']) 100 | , that = this; 101 | worker.stdout.setEncoding('utf8'); 102 | worker.stdout.on('data', function (data) { 103 | that.check(data); 104 | }); 105 | worker.stderr.setEncoding('utf8'); 106 | worker.stderr.on('data', function (data) { 107 | console.log('PhantomJS worker ' + id + ' has occur a error: ' + data); 108 | }); 109 | worker.on('close', function (code) { 110 | var key; 111 | for (key in _connectionId2WorkerId) { 112 | if (_connectionId2WorkerId[key] === id) { 113 | emitter.emit('died', key); 114 | _connectionId2WorkerId[key] = null; 115 | delete _connectionId2WorkerId[key]; 116 | break; 117 | } 118 | } 119 | emitter.emit(id + 'Died'); 120 | 121 | // worker died 122 | that.createWorker(); 123 | }); 124 | }, 125 | /** 126 | * check 127 | * @param {String} string 128 | */ 129 | check: function (string) { 130 | if (!~string.indexOf('{{end}}')) return; 131 | var match = string.match(/\{\{begin\}\}(.*?)\{\{end\}\}/); 132 | match && emitter.emit('data', JSON.parse(match[1])); 133 | } 134 | }; 135 | 136 | /** 137 | * init 138 | * @param {Object} opts 139 | * @param {Function} opts.onGet 140 | * @param {Function} opts.onPost 141 | * @param {Function} opts.onData 142 | */ 143 | function init(opts) { 144 | var workerNum = opts.workerNum || require('os').cpus().length; 145 | websocketMan.init(); 146 | workerMan.init(workerNum); 147 | opts.onGet && emitter.on('get', opts.onGet); 148 | opts.onData && emitter.on('data', opts.onData); 149 | opts.onDied && emitter.on('died', opts.onDied); 150 | } 151 | 152 | /** 153 | * send 154 | * @param {String} connectionId 155 | * @param {Array} jobList 156 | * @param {Job} jobList[n] 157 | * @param {String} jobList[n].id 158 | * @param {String} jobList[n].url 159 | * @param {Boolean} jobList[n].content 160 | */ 161 | function send(connectionId, jobList) { 162 | var msg = { 163 | method: 'POST', 164 | jobList: jobList 165 | }; 166 | websocketMan.send(connectionId, msg); 167 | } 168 | 169 | /** 170 | * close 171 | * @param {String} connectionId 172 | * @param {Function} callback 173 | */ 174 | function close(connectionId, callback) { 175 | websocketMan.send(connectionId, { method: 'CLOSE' }); 176 | callback && emitter.once(_connectionId2WorkerId[connectionId] + 'Died', callback); 177 | } 178 | 179 | return { 180 | init: init, 181 | send: send, 182 | close: close, 183 | on: emitter.on.bind(emitter), 184 | off: emitter.removeListener.bind(emitter) 185 | }; 186 | 187 | })(); -------------------------------------------------------------------------------- /lib/extracter.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * url-extract - Snapshot & extract url library 3 | * Copyright(c) 2013 Radica Systems Limited 4 | * Copyright(c) 2013 Daniel Yang 5 | * MIT Licensed 6 | */ 7 | "use strict"; 8 | var _extracter 9 | , configMod = require('./config'); 10 | 11 | module.exports = function (opts) { 12 | if (!_extracter){ 13 | return (function (opts) { 14 | opts && configMod.set(opts); 15 | var fs = require('fs') 16 | , isArray = require('util').isArray 17 | , config = configMod.get() 18 | , bridge = require('./bridge') 19 | , Job = require('./Job') 20 | , jobPool = require('./jobPool') 21 | , Map = require('./Map') 22 | , jobTimer = require('./jobTimer')(function (job) { 23 | _onData(job.fail()); 24 | }) 25 | , validUrl = /^https?:\/\//; 26 | 27 | var _noop = function () {} 28 | , _callback = _noop 29 | , _freeWorker = new Map() 30 | , _connectionId2jobList = new Map() 31 | , _job2connectionId = {} 32 | , _unfinish = config.unfinish || _noop; 33 | 34 | function _pushFreeWorker(connectionId, num) { 35 | var free = _freeWorker.get(connectionId); 36 | if (free) { 37 | free.num = num; 38 | } else { 39 | _freeWorker.set(connectionId, { 40 | connectionId: connectionId, 41 | num: num 42 | }); 43 | } 44 | } 45 | 46 | function _pushJobList(connectionId, jobList) { 47 | for (var i = jobList.length; i--;) { 48 | _job2connectionId[jobList[i].id] = connectionId; 49 | } 50 | 51 | var list = _connectionId2jobList.get(connectionId); 52 | if (list) { 53 | _connectionId2jobList.remove(connectionId); 54 | list = list.concat(jobList); 55 | _connectionId2jobList.set(connectionId, list); 56 | } else { 57 | _connectionId2jobList.set(connectionId, jobList); 58 | } 59 | } 60 | 61 | function _finishJob(jobId) { 62 | var connectionId = _job2connectionId[jobId]; 63 | if (connectionId) { 64 | var jobList = _connectionId2jobList.get(connectionId); 65 | for (var i = jobList.length; i--;) { 66 | if (jobId === jobList[i].id) { 67 | var j = jobList.splice(i, 1); 68 | } 69 | } 70 | _job2connectionId[jobId] = null; 71 | delete _job2connectionId[jobId]; 72 | } 73 | } 74 | 75 | function _onGet(connectionId, num) { 76 | var jobList = jobPool.shift(num) 77 | , len = jobList.length; 78 | if (len > 0) { 79 | bridge.send(connectionId, jobList); 80 | _pushJobList(connectionId, jobList); 81 | jobTimer.push(jobList); 82 | len < num ? _pushFreeWorker(connectionId, num - len) : _freeWorker.remove(connectionId); 83 | } else { 84 | _pushFreeWorker(connectionId, num); 85 | } 86 | } 87 | 88 | function _onData(data) { 89 | var job = data instanceof Job ? data : jobPool.get(data.jobId); 90 | if (job) { 91 | job.setData(data); 92 | if (job.callback) { 93 | job.callback(job); 94 | } else { 95 | _callback(job); 96 | } 97 | var j = jobTimer.done(data.jobId); 98 | _finishJob(data.jobId); 99 | } else { 100 | // Timeout job is done. 101 | } 102 | } 103 | 104 | function _onDied(connectionId) { 105 | var jobList = _connectionId2jobList.get(connectionId); 106 | _connectionId2jobList.remove(connectionId); 107 | var unfinish = []; 108 | if (jobList) { 109 | for (var i = 0; i < jobList.length; i++) { 110 | var jobs = jobTimer.done(jobList[i].id); 111 | if (typeof jobs != 'undefined' && jobs.length > 0 && jobs[0] instanceof Job) { 112 | unfinish.push(jobs[0]); 113 | } 114 | } 115 | unfinish.length > 0 && 116 | _unfinish({ 117 | _: unfinish, 118 | push: function () { 119 | return jobPool.push(unfinish); 120 | }, 121 | unshift: function () { 122 | return jobPool.unshift(unfinish); 123 | } 124 | }); 125 | } 126 | } 127 | 128 | 129 | // init 130 | bridge.init({ 131 | onGet: _onGet, 132 | onData: _onData, 133 | onDied: _onDied, 134 | workerNum: config.workerNum 135 | }); 136 | 137 | function _success(job, isPriority) { 138 | isPriority ? jobPool.unshift(job) : jobPool.push(job); 139 | if (_freeWorker.length > 0) { 140 | var param = _freeWorker.get(); 141 | return _onGet(param.connectionId, param.num); 142 | } 143 | } 144 | 145 | function _checkMaxJob(num) { 146 | return (config.maxQueueJob ? (config.maxQueueJob >= jobPool.count() + num) : true); 147 | } 148 | 149 | function _push(job) { 150 | var i 151 | , _job; 152 | if (isArray(job)) { 153 | if (_checkMaxJob(job.length)) { 154 | for (i = job.length; i--;) { 155 | _job = job[i]; 156 | validUrl.test(_job.url) ? 157 | _success(_job) : 158 | _onData(_job.fail()); 159 | } 160 | return job; 161 | } else { 162 | for (i = job.length; i--;) { 163 | _onData(job[i].fail()); 164 | } 165 | return false; 166 | } 167 | } else { 168 | if (_checkMaxJob(1) && validUrl.test(job.url)) { 169 | _success(job); 170 | return job; 171 | } else { 172 | _onData(job.fail()); 173 | return false; 174 | } 175 | } 176 | } 177 | 178 | function _create(url, opt, content) { 179 | (typeof opt === 'function') ? (opt = { callback: opt }) : 180 | (typeof opt === 'string') && (opt = { image : opt }); 181 | var job 182 | , jobList = [] 183 | , i; 184 | if (isArray(url)) { 185 | if (opt) { 186 | opt.id && (opt.id = undefined); 187 | opt.image && (opt.image = undefined); 188 | } 189 | for (i = url.length; i--;) { 190 | job = new Job(url[i], opt, content); 191 | jobList.push(job); 192 | } 193 | return _push(jobList); 194 | } else { 195 | job = new Job(url, opt, content); 196 | return _push(job); 197 | } 198 | } 199 | 200 | /** 201 | * extract(url[s], opt) 202 | * extract(url[s], callback) 203 | * extract(url, image) 204 | * @param {String} url 205 | * @param {Array} urls 206 | * @param {Object} opt 207 | * @option {String} id 208 | * @option {String} groupId 209 | * @option {Function} callback 210 | * @option {Boolean} ignoreCache 211 | * @option {Object} viewportSize 212 | * @option {Object} clipRect 213 | * @option {Number} zoomFactor 214 | * @option {String} image 215 | */ 216 | function extract(url, opt) { 217 | return _create(url, opt, true); 218 | } 219 | 220 | /** 221 | * snapshot(url[s], opt) 222 | * snapshot(url[s], callback) 223 | * snapshot(url, image) 224 | * @param {String} url 225 | * @param {Array} urls 226 | * @param {Object} opt 227 | * @option {String} id 228 | * @option {String} groupId 229 | * @option {Function} callback 230 | * @option {Boolean} ignoreCache 231 | * @option {Object} viewportSize 232 | * @option {Object} clipRect 233 | * @option {Number} zoomFactor 234 | * @option {String} image 235 | */ 236 | function snapshot(url, opt) { 237 | return _create(url, opt, false); 238 | } 239 | 240 | /** 241 | * bind 242 | * @param {Function} callback 243 | */ 244 | function bind(callback) { 245 | _callback = callback ? callback : _noop; 246 | } 247 | 248 | /** 249 | * reset(num, callback) 250 | * reset(num) 251 | * reset(callback) 252 | * reset() 253 | * @param {Number} num 254 | * @param {Function} callback 255 | */ 256 | function reset(num, callback) { 257 | callback = callback || ((typeof num === 'function') ? num : undefined); 258 | num = num || -1; 259 | _freeWorker.each(function (worker) { 260 | if (worker.num === config.maxJob) { 261 | bridge.close(worker.connectionId, callback); 262 | _freeWorker.remove(worker.connectionId); 263 | if (!(--num)) return false; 264 | } 265 | return true; 266 | }); 267 | } 268 | 269 | function close() { 270 | _freeWorker.each(function (worker) { 271 | bridge.close(worker.connectionId); 272 | _freeWorker.remove(worker.connectionId); 273 | }); 274 | } 275 | 276 | _extracter = { 277 | extract: extract, 278 | snapshot: snapshot, 279 | bind: bind, 280 | reset: reset, 281 | opt: configMod.set, 282 | quick: jobPool.quick, 283 | close: close 284 | }; 285 | 286 | return _extracter; 287 | })(opts); 288 | } else { 289 | if (opts) throw new Error('Sorry, url-extract could not be initialized more than one time.'); 290 | return _extracter; 291 | } 292 | }; --------------------------------------------------------------------------------