├── .gitignore
├── LICENSE.txt
├── README.md
├── examples
│   ├── crawl-alexa-1m
│   │   ├── createHar.js
│   │   ├── master.js
│   │   ├── top-1m-dummy.csv
│   │   └── worker.js
│   ├── message-passing
│   │   ├── master.js
│   │   └── worker.js
│   ├── minimal
│   │   ├── master.js
│   │   └── worker.js
│   └── simple-crawler
│       ├── master.js
│       └── worker.js
├── lib
│   ├── master
│   │   ├── Pool.js
│   │   └── WorkerControl.js
│   ├── phantomjs-pool.js
│   └── worker
│       └── Worker.js
└── package.json

/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | examples/crawl-alexa-1m/results
4 | examples/crawl-alexa-1m/top-1m.csv
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015 Thomas Dondorf
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PhantomJS Pool Library
2 |
3 | Create a pool of PhantomJS workers.
4 |
5 | ## Install
6 |
7 | `npm install phantomjs-pool`
8 |
9 | Additionally, get the PhantomJS binary (via `npm install phantomjs-prebuilt`, `npm install phantomjs`, or `npm install phantomjs2`)
10 | or download the binary file yourself.
11 |
12 | ## Usage
13 |
14 | Check out the examples directory. Here is a minimal example, which saves screenshots of the Google search results for the numbers 0 to 9 using four workers.
15 |
16 | #### master.js
17 |
18 |     var Pool = require('phantomjs-pool').Pool;
19 |
20 |     function jobCallback(job, worker, index) {
21 |
22 |         if (index < 10) { // we just use the index as our data
23 |             job(index, function(err) {
24 |                 console.log('DONE: ' + index);
25 |             });
26 |         } else { // no more jobs
27 |             job(null);
28 |         }
29 |     }
30 |
31 |     var pool = new Pool({
32 |         numWorkers : 4,
33 |         jobCallback : jobCallback,
34 |         workerFile : __dirname + '/worker.js' // location of the worker file (as an absolute path)
35 |     });
36 |     pool.start();
37 |
38 | #### worker.js
39 |
40 |     var webpage = require('webpage');
41 |
42 |     module.exports = function(data, done, worker) {
43 |         var page = webpage.create();
44 |
45 |         // search for the given data (which contains the index number) and save a screenshot
46 |         page.open('https://www.google.com/search?q=' + data, function() {
47 |             page.render('google' + data + '.png');
48 |             done(null);
49 |         });
50 |
51 |     };
52 |
53 | ## How does it work?
54 |
55 | The master file (master.js in the example) is executed via Node.js and spawns multiple PhantomJS processes.
56 | Each PhantomJS process creates a server to communicate with the master process.
57 | That way, the data from the master is submitted to the worker.
58 | The worker file (worker.js in the example) is loaded into the PhantomJS environment and receives the data from the master process.
59 | After executing a job, the worker calls the done function to signal that it is ready for the next job.
60 |
61 | Some of the features of the library:
62 | * Interoperability between Node.js (master) and PhantomJS (workers)
63 | * Distribution of jobs between workers
64 | * Simple error reporting, error handling and logging
65 | * Restart of workers if necessary (due to [memory leaks](https://github.com/ariya/phantomjs/issues/11390))
66 | * Recreation of crashed workers (due to [segmentation faults](https://github.com/ariya/phantomjs/issues/13175))
67 | * Restart of workers that are stuck (not calling the done function)
68 |
69 |
70 | ## Documentation
71 |
72 | ### Master (Pool)
73 |
74 | Require the library to get access to `Pool`:
75 |
76 |     var Pool = require('phantomjs-pool').Pool;
77 |
78 | The constructor has the following options:
79 |
80 | * `workerFile` -- This is the PhantomJS JavaScript file that contains the logic for the worker.
81 | Make sure to use an absolute path or simply use `__dirname` followed by the path to your file.
82 | * `jobCallback` -- Expects a function which is called each time a worker is ready to receive a job. This function is described in detail below.
83 | * `phantomjsBinary` (optional) -- The path to the PhantomJS binary. You can omit this field if you have the `phantomjs-prebuilt`, `phantomjs`
84 | or `phantomjs2` npm module installed; the library will then use whichever of these modules it finds.
85 | As an alternative you can simply download the binary yourself and use this property to specify its path.
86 | * `numWorkers` (default: `2`) -- Number of PhantomJS workers used. This represents how many websites can be crawled simultaneously.
87 | Depending on the system resources and the available network throughput, a value between 4 and 20 might be desirable.
88 | * `spawnWorkerDelay` (default: `0`) -- Most of the time we do not want to spawn all workers at the same time,
89 | as this would cause a network peak at the beginning.
90 | The given number is interpreted as the delay in milliseconds between the spawning of two workers.
91 | If the value is set to 100, the first worker will spawn instantly, the second worker will spawn with a 100ms delay,
92 | the third with a 200ms delay, and so on.
93 | * `phantomjsOptions` (default: `[]`) -- Expects an array containing command line arguments for the PhantomJS binary.
94 | This can be used when using a proxy or another feature of PhantomJS that needs to be configured via the command line.
95 | Example: `["--proxy=127.0.0.1:8080", "--proxy-type=http"]`
96 | * `verbose` (default: `false`) -- If the flag is set to true, the library logs the communication between
97 | master and worker as well as some additional information which might help resolve problems.
98 | * `workerTimeout` (default: `120000` = 2min) -- The time in milliseconds a worker may work without giving feedback.
99 | If a worker does not respond within that time, its process is killed and the job is marked as erroneous.
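For illustration, here is a sketch that combines several of these options (the proxy address and the delay values are placeholders, not recommendations, and a `jobCallback` function as described in the next section is assumed):

    var Pool = require('phantomjs-pool').Pool;

    var pool = new Pool({
        workerFile       : __dirname + '/worker.js',
        jobCallback      : jobCallback,
        numWorkers       : 8,                 // crawl eight pages at the same time
        spawnWorkerDelay : 250,               // stagger worker startup by 250ms each
        phantomjsOptions : ['--proxy=127.0.0.1:8080', '--proxy-type=http'],
        workerTimeout    : 60 * 1000,         // treat a worker as stuck after 1min of silence
        verbose          : false
    });
    pool.start();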
100 |
101 | #### jobCallback
102 |
103 | The provided `jobCallback` function is called each time a worker is ready to receive a job.
104 | The function is called with three arguments: `job`, `worker`, `index`
105 | * `job(data[, callback])` -- A function that expects two arguments. The first argument contains the data that will be sent to the worker.
106 | This needs to be a valid JSON object (properties such as functions will not be sent to the worker). The second argument is optional
107 | and can be used to provide a callback function which will be called when the job has been executed
108 | (for simplicity, let's name this function `afterJobCallback`).
109 | The `afterJobCallback` function is called after the worker executed the job, with an error and additional information: `afterJobCallback(error, data)`
110 | The first parameter (`error`) is `null` if the job was successful, or an object of type `Error`. To read the error message use `error.message`.
111 | The error can either be a library-specific error message, a PhantomJS error message or a message that has been declared by the worker script (via the error passed to the `done` function).
112 | The `data` object contains the data that is sent by the worker using the `done` function. If the worker did not send any data, `data` is undefined.
113 | * `worker` -- Contains information about the worker. Currently this is only the ID. Each worker gets an ID (starting at 0).
114 | * `index` -- The value is `0` for the first call of the `jobCallback` function and increments with each following job.
115 | This makes it very simple to work through an array of jobs, as the example below shows.
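As an example, a `jobCallback` that works through an array of URLs and uses `afterJobCallback` for logging might look like this (a minimal sketch in the style of the bundled simple-crawler example; the URLs are placeholders):

    var urls = ['http://www.example.com/', 'http://phantomjs.org/'];

    function jobCallback(job, worker, index) {
        if (index < urls.length) {
            // send the next URL to the worker that just became ready
            job({ url : urls[index] }, function afterJobCallback(error, data) {
                if (error) {
                    console.log('Failed: ' + urls[index] + ' (' + error.message + ')');
                } else {
                    console.log('Done: ' + urls[index]);
                }
            });
        } else {
            job(null); // no more jobs, this worker can be shut down
        }
    }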
116 |
117 | ### Worker
118 |
119 | The exports object needs to be a single function, which will be called with three arguments: `data`, `done`, `worker`
120 |
121 |     module.exports = function(data, done, worker) { /* ... */ }
122 |
123 | #### data
124 |
125 | The `data` object contains the data that has been sent via the `job` function in the `jobCallback` function.
126 |
127 | #### done
128 |
129 | The `done` function needs to be called by the script after the execution of the job.
130 | The first parameter may contain an error. The second parameter may contain additional information.
131 |
132 | Examples:
133 | * `done()` -- The job has been executed successfully. No additional data is provided for the master.
134 | * `done(null, { foo : "bar" })` -- The job has been executed successfully. The additional data will be passed to the master. This can be any valid JSON object.
135 | See the `jobCallback` function to read where the data will be received.
136 | * `done(new Error("Crawl Error"))` -- An error happened during the execution. The error reason should be passed in the constructor.
137 | Additional information that is added to the error object will not be sent to the master.
138 | Therefore, do not add extra properties to the error object. Use the second argument to send additional data.
139 | * `done(new Error("Crawl Error"), { problem : "...", foo : [1,2,3] })` -- An error happened. The second argument can again be used to send additional information.
140 |
141 |
142 | #### worker
143 |
144 | The worker object contains information about the worker itself. Currently, this is only the ID of the worker.
145 |
146 | * `id` -- ID of the worker, e.g. `worker.id` is `2` for the third worker (counting starts at zero).
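Putting the three arguments together, a worker that reports either a result or an error back to the master could look like this (a sketch which assumes that the job data contains a `url` property, as in the examples above):

    var webpage = require('webpage');

    module.exports = function(data, done, worker) {
        var page = webpage.create();

        page.open(data.url, function(status) {
            if (status !== 'success') {
                // the message is received by the master as error.message
                done(new Error('Could not open ' + data.url));
            } else {
                // send the page title back to the master
                done(null, { title : page.title, crawledBy : worker.id });
            }
        });
    };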
147 |
148 | ## License
149 |
150 | MIT License.
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/createHar.js:
--------------------------------------------------------------------------------
1 | // Based on https://gist.github.com/dongyuwei/3689928
2 |
3 | if (!Date.prototype.toISOString) {
4 |     Date.prototype.toISOString = function () {
5 |         function pad(n) { return n < 10 ? '0' + n : n; }
6 |         function ms(n) { return n < 10 ? '00' + n : n < 100 ? '0' + n : n; }
7 |         return this.getUTCFullYear() + '-' +
8 |             pad(this.getUTCMonth() + 1) + '-' +
9 |             pad(this.getUTCDate()) + 'T' +
10 |             pad(this.getUTCHours()) + ':' +
11 |             pad(this.getUTCMinutes()) + ':' +
12 |             pad(this.getUTCSeconds()) + '.' +
13 |             ms(this.getUTCMilliseconds()) + 'Z'; // the trailing 'Z' denotes UTC, so we use the UTC getters
14 |     };
15 | }
16 |
17 | function createHAR(address, title, startTime, endTime, resources) {
18 |     var entries = [];
19 |
20 |     resources.forEach(function (resource) {
21 |         var request = resource.request,
22 |             startReply = resource.startReply,
23 |             endReply = resource.endReply;
24 |
25 |         if (!request || !startReply || !endReply) {
26 |             return;
27 |         }
28 |
29 |         // Exclude data URIs from the HAR file because
30 |         // they aren't part of the specification
31 |         if (request.url.substr(0, 5).toLowerCase() === 'data:') {
32 |             return;
33 |         }
34 |
35 |         entries.push({
36 |             startedDateTime: request.time.toISOString(),
37 |             time: endReply.time - request.time,
38 |             request: {
39 |                 method: request.method,
40 |                 url: request.url,
41 |                 httpVersion: "HTTP/1.1",
42 |                 cookies: [],
43 |                 headers: request.headers,
44 |                 queryString: [],
45 |                 headersSize: -1,
46 |                 bodySize: -1
47 |             },
48 |             response: {
49 |                 status: endReply.status,
50 |                 statusText: endReply.statusText,
51 |                 httpVersion: "HTTP/1.1",
52 |                 cookies: [],
53 |                 headers: endReply.headers,
54 |                 redirectURL: "",
55 |                 headersSize: -1,
56 |                 bodySize: startReply.bodySize,
57 |                 content: {
58 |                     size: startReply.bodySize,
59 |                     mimeType: endReply.contentType
60 |                 }
61 |             },
62 |             cache: {},
63 |             timings: {
64 |                 blocked: 0,
65 |                 dns: -1,
66 |                 connect: -1,
67 |                 send: 0,
68 |                 wait: startReply.time - request.time,
69 |                 receive: endReply.time - startReply.time,
70 |                 ssl: -1
71 |             },
72 |             pageref: address
73 |         });
74 |     });
75 |
76 |     return {
77 |         log: {
78 |             version: '1.2',
79 |             creator: {
80 |                 name: 'PhantomJS',
81 |                 version: phantom.version.major + '.' + phantom.version.minor + '.' + phantom.version.patch
82 |             },
83 |             pages: [{
84 |                 startedDateTime: startTime.toISOString(),
85 |                 id: address,
86 |                 title: title,
87 |                 pageTimings: {
88 |                     onLoad: endTime - startTime
89 |                 }
90 |             }],
91 |             entries: entries
92 |         }
93 |     };
94 | }
95 |
96 | module.exports = createHAR;
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/master.js:
--------------------------------------------------------------------------------
1 |
2 | var fs = require('fs');
3 | var Pool = require('../../lib/phantomjs-pool').Pool;
4 |
5 | var ALEXA_FILE = 'top-1m.csv';
6 | var ALEXA_SIZE = 1000000;
7 |
8 | // Check if the Alexa Top 1 Million file was downloaded, otherwise we will simply use the dummy file
9 | if (!fs.existsSync(ALEXA_FILE)) {
10 |     console.log('Please download and unzip the Alexa 1 Million file top-1m.csv and place it in this directory:');
11 |     console.log('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip');
12 |
13 |     console.log('');
14 |     console.log('We will now continue using a dummy file that only contains 20 entries.');
15 |
16 |
17 |     ALEXA_FILE = 'top-1m-dummy.csv';
18 |     ALEXA_SIZE = 20;
19 |
20 |     setTimeout(startCrawling, 2000);
21 | } else {
22 |     startCrawling();
23 | }
24 |
25 | function startCrawling() {
26 |     console.log('Reading ' + ALEXA_FILE);
27 |     var lines = fs.readFileSync(ALEXA_FILE).toString().split('\n');
28 |     console.log(' - Done.');
29 |
30 |     var total = 0;
31 |     var successful = 0;
32 |
33 |     // Called when a worker is ready for a new job
34 |     // job is the function that needs to be called to execute the job
35 |     // index contains a number (starting at 0) that is increased with each jobCallback call
36 |     function jobCallback(job, worker, index) {
37 |
38 |         if (index < ALEXA_SIZE) {
39 |             var line = lines[index].trim();
40 |
41 |             var split = line.split(',');
42 |             var id = parseInt(split[0]);
43 |             var url = split[1];
44 |             job({
45 |                 id : id,
46 |                 url : url
47 |             }, function(err) {
48 |                 // Let's log if it worked
49 |                 total++;
50 |                 if (err) {
51 |                     console.log('Problem #' + worker.id + ': ' + err.message + ' for line: ' + line);
52 |                 } else {
53 |                     console.log(' #' + worker.id + ' DONE: ' + line);
54 |                     successful++;
55 |                 }
56 |                 if (total % 10 === 0) {
57 |                     console.log('################## ' + successful + '/' + total + ' (success/crawled) ################################');
58 |                 }
59 |             });
60 |         } else {
61 |             // no more content!
62 |             job(null);
63 |         }
64 |     }
65 |
66 |     var pool = new Pool({
67 |         numWorkers : 4,
68 |         // verbose : true, // enable if you want to see more
69 |         jobCallback : jobCallback,
70 |         workerFile : __dirname + '/worker.js' // location of our worker file (as an absolute path)
71 |     });
72 |     pool.start();
73 | }
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/top-1m-dummy.csv:
--------------------------------------------------------------------------------
1 | 1,google.com
2 | 2,facebook.com
3 | 3,youtube.com
4 | 4,baidu.com
5 | 5,yahoo.com
6 | 6,wikipedia.org
7 | 7,amazon.com
8 | 8,twitter.com
9 | 9,taobao.com
10 | 10,qq.com
11 | 11,google.co.in
12 | 12,live.com
13 | 13,linkedin.com
14 | 14,sina.com.cn
15 | 15,weibo.com
16 | 16,yahoo.co.jp
17 | 17,tmall.com
18 | 18,google.co.jp
19 | 19,ebay.com
20 | 20,google.de
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/worker.js:
--------------------------------------------------------------------------------
1 |
2 | var webpage = require('webpage');
3 | var fs = require('fs');
4 |
5 | var createHar = require('./createHar');
6 |
7 | module.exports = function(data, done, worker) {
8 |
9 |     var page = webpage.create();
10 |     page.clearCookies();
11 |     page.clearMemoryCache();
12 |     page.settings.resourceTimeout = 30000;
13 |
14 |     var resources = [];
15 |     var startTime = -1;
16 |     var endTime = -1;
17 |
18 |     var address = 'http://' + data.url;
19 |
20 |     // PhantomJS's own onLoadFinished event does not always work, so we basically have to implement it on our own.
21 |     // For this we check the requests, and as soon as there have been no outgoing requests for some time (and all
22 |     // responses have arrived) we assume the page is loaded
23 |
24 |     // if no other request is made within 100ms, we consider the page loaded
25 |     var FINAL_TIMEOUT = 100;
26 |
27 |     var finalCheckTimeout = null;
28 |     var openRequests = 0;
29 |
30 |     var isLoaded = false;
31 |     function pageLoaded(status) {
32 |         if (!isLoaded) {
33 |             isLoaded = true;
34 |             if (status !== 'success') {
35 |                 done(new Error('Crawl Error: ' + page.reason + ' for ' + page.reason_url));
36 |             } else {
37 |                 logPage();
38 |             }
39 |         }
40 |     }
41 |
42 |     function logPage() {
43 |         var endTime = new Date();
44 |         var title = page.evaluate(function () {
45 |             return document.title;
46 |         });
47 |
48 |         var har = createHar(address, title, startTime, endTime, resources);
49 |
50 |         // we don't want 1M files in one directory, so we divide them into directories of 1000 files each
51 |         var dirId = parseInt(data.id / 1000)*1000;
52 |         var fileName = __workerDirname + '/results/' + dirId + '/' + data.id + '-' + data.url.replace(/[^\w.,;+\-]/g, '_') + '.json';
53 |         fs.write(fileName, JSON.stringify(har, null, 4), 'w');
54 |
55 |         done();
56 |     }
57 |
58 |
59 |
60 |
61 |     page.onLoadStarted = function () {
62 |         startTime = new Date();
63 |     };
64 |
65 |     page.onResourceRequested = function (req) {
66 |         clearTimeout(finalCheckTimeout);
67 |         resources[req.id] = {
68 |             request: req,
69 |             startReply: null,
70 |             endReply: null
71 |         };
72 |         openRequests++;
73 |     };
74 |
75 |     page.onResourceReceived = function (res) {
76 |         if (res.stage === 'start') {
77 |             resources[res.id].startReply = res;
78 |         } else if (res.stage === 'end') {
79 |             resources[res.id].endReply = res;
80 |             openRequests--;
81 |
82 |             if (openRequests === 0) {
83 |                 finalCheckTimeout = setTimeout(function() {
84 |                     if (!isLoaded) {
85 |                         console.log('ALTERNATIVE LOADING EVENT!');
86 |                     }
87 |                     pageLoaded('success'); // we assume everything is fine
88 |                 }, FINAL_TIMEOUT);
89 |             }
90 |         }
91 |     };
92 |
93 |     page.onResourceError = function (resourceError) {
94 |         page.reason = resourceError.errorString;
95 |         page.reason_url = resourceError.url;
96 |     };
97 |
98 |     page.onError = function (msg, trace) {
99 |         // we intentionally ignore errors happening on the page itself
100 |     };
101 |
102 |     page.open(address, pageLoaded);
103 |
104 | };
105 |
--------------------------------------------------------------------------------
/examples/message-passing/master.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('../../lib/phantomjs-pool').Pool;
3 |
4 | function jobCallback(job, worker, index) {
5 |     job({
6 |         index : index,
7 |         moreData : "Hello World!"
8 |     }, function(err, data) {
9 |         if (err) {
10 |             console.log('We got an error for worker #' + err.workerId + ': ' + err.message);
11 |         } else {
12 |             console.log('I got data back from worker #' + data.workerId + ': ' + data.indexBack + ' (more data: ' + data.foo + ').');
13 |         }
14 |     });
15 | }
16 |
17 |
18 | var pool = new Pool({
19 |     numWorkers : 3,
20 |     jobCallback : jobCallback,
21 |     workerFile : __dirname + '/worker.js'
22 | });
23 | pool.start();
--------------------------------------------------------------------------------
/examples/message-passing/worker.js:
--------------------------------------------------------------------------------
1 |
2 | module.exports = function(data, done, worker) {
3 |
4 |     console.log('Logging works too! Yay!');
5 |
6 |     if (worker.id === 1) {
7 |         // let's create an error case every time worker 1 does something
8 |         setTimeout(function() {
9 |             done(new Error('Error, I don\'t work for this worker!'));
10 |         }, 2000);
11 |     } else {
12 |         var result = {
13 |             workerId : worker.id,
14 |             indexBack : data.index, // this does not make much sense, but let's just send the index back
15 |             foo : 'Greetings, Friend!'
16 |         };
17 |
18 |         // let's make it look like we did some work...
19 |         setTimeout(function() {
20 |             done(null, result);
21 |         }, 5000 + Math.random());
22 |
23 |     }
24 | };
--------------------------------------------------------------------------------
/examples/minimal/master.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('../../lib/phantomjs-pool').Pool;
3 |
4 | function jobCallback(job, worker, index) {
5 |
6 |     if (index < 10) { // we just use the index as our data
7 |         job(index, function(err) {
8 |             console.log('DONE: ' + index);
9 |         });
10 |     } else { // no more jobs
11 |         job(null);
12 |     }
13 | }
14 |
15 | var pool = new Pool({
16 |     numWorkers : 4,
17 |     jobCallback : jobCallback,
18 |     workerFile : __dirname + '/worker.js' // location of our worker file (as an absolute path)
19 | });
20 | pool.start();
--------------------------------------------------------------------------------
/examples/minimal/worker.js:
--------------------------------------------------------------------------------
1 |
2 | var webpage = require('webpage');
3 |
4 | module.exports = function(data, done, worker) {
5 |     var page = webpage.create();
6 |
7 |     // search for the given data (which contains the index number) and save a screenshot
8 |     page.open('https://www.google.com/search?q=' + data, function() {
9 |         page.render('google' + data + '.png');
10 |         done(null);
11 |     });
12 |
13 | };
--------------------------------------------------------------------------------
/examples/simple-crawler/master.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('../../lib/phantomjs-pool').Pool;
3 |
4 | var pages = [
5 |     'http://www.google.com/',
6 |     'http://www.example.com/',
7 |     'http://www.stackoverflow.com/',
8 |     'http://phantomjs.org/',
9 |     'http://www.nodejs.org/',
10 |     'http://www.reddit.com/',
11 |     'http://www.youtube.com/',
12 |     'http://www.amazon.com/'
13 | ];
14 |
15 | // Called when a worker is ready for a new job
16 | // job is the function that needs to be called to execute the job
17 | // index contains a number (starting at 0) that is increased with each jobCallback call
18 | function jobCallback(job, worker, index) {
19 |
20 |     // as long as we have urls that we want to crawl, we execute the job
21 |     var url = pages[index];
22 |     if (index < pages.length) {
23 |
24 |         // the first argument contains the data which is passed to the worker
25 |         // the second argument is a callback which is called when the job has been executed
26 |         job({
27 |             url : url,
28 |             id : index
29 |         }, function(err) {
30 |             // Let's log if it worked
31 |             if (err) {
32 |                 console.log('There were some problems for url ' + url + ': ' + err.message);
33 |             } else {
34 |                 console.log('DONE: ' + url + ' (' + index + ')');
35 |             }
36 |         });
37 |     } else {
38 |         // if we have no more jobs, we call the function job with null
39 |         job(null);
40 |     }
41 | }
42 |
43 | var pool = new Pool({
44 |     numWorkers : 3,
45 |     jobCallback : jobCallback,
46 |     workerFile : __dirname + '/worker.js' // location of our worker file (as an absolute path)
47 | });
48 | pool.start();
--------------------------------------------------------------------------------
/examples/simple-crawler/worker.js:
--------------------------------------------------------------------------------
1 |
2 | var webpage = require('webpage');
3 |
4 | // the worker needs to export one function, which is called with the job
5 | module.exports = function(data, done, worker) {
6 |
7 |     // data contains the data we passed to the job function in the master file
8 |     // done is a function which needs to be called to signal that the job has been executed
9 |     // worker contains some meta data about this worker (like the id)
10 |
11 |     // we just fetch the page and save it as an image
12 |     var page = webpage.create();
13 |     page.open(data.url, function() {
14 |         page.render(data.id + '.png');
15 |
16 |         // then we call the done function with null to signal that we successfully executed the job
17 |         done(null);
18 |     });
19 |
20 | };
--------------------------------------------------------------------------------
/lib/master/Pool.js:
--------------------------------------------------------------------------------
1 |
2 | var Worker = require('./WorkerControl');
3 | var fs = require('fs');
4 |
5 | function Pool(options) {
6 |     this.size = options.numWorkers || 2;
7 |     this.spawnWorkerDelay = options.spawnWorkerDelay || 0;
8 |     this.phantomjsOptions = options.phantomjsOptions || [];
9 |     this.verbose = options.verbose || false;
10 |     this.workerTimeout = options.workerTimeout || 120 * 1000;
11 |
12 |     this.jobIndex = 0;
13 |
14 |     if (options.phantomjsBinary) {
15 |         this.phantomjsBinary = options.phantomjsBinary;
16 |     } else {
17 |         // Check if PhantomJS is installed
18 |         var phantomjsLib;
19 |         try {
20 |             phantomjsLib = require('phantomjs-prebuilt');
21 |         } catch (e) {} // Do nothing, we were just checking
22 |         try {
23 |             phantomjsLib = require('phantomjs');
24 |         } catch (e) {}
25 |         try {
26 |             phantomjsLib = require('phantomjs2');
27 |         } catch (e) {}
28 |
29 |         if (phantomjsLib) {
30 |             this.phantomjsBinary = phantomjsLib.path;
31 |         } else {
32 |             throw new Error('PhantomJS binary not found. Use the option phantomjsBinary or install phantomjs via npm.');
33 |         }
34 |     }
35 |
36 |     if (!options.workerFile) {
37 |         throw new Error('workerFile in options expected.');
38 |     }
39 |     this.workerFile = options.workerFile;
40 |
41 |     this.jobCallback = options.jobCallback;
42 |     if (!options.jobCallback) {
43 |         throw new Error('jobCallback in options expected.');
44 |     }
45 |
46 |     this.workers = [];
47 | }
48 |
49 | // Adds workers until the pool size is reached
50 | Pool.prototype.spawnWorkers = function () {
51 |     var that = this;
52 |     if (this.size > this.workers.length) {
53 |         this.addWorker();
54 |         setTimeout(function () {
55 |             that.spawnWorkers();
56 |         }, this.spawnWorkerDelay);
57 |     }
58 | };
59 |
60 | // adds one worker to the pool
61 | Pool.prototype.addWorker = function () {
62 |     if (this.verbose) {
63 |         console.log('Creating worker #' + this.workers.length);
64 |     }
65 |     this.workers.push(Worker.create(this));
66 | };
67 |
68 | Pool.prototype.getJob = function(jobCallback, workerData) {
69 |     this.jobCallback(jobCallback, workerData, this.jobIndex);
70 |     this.jobIndex++;
71 | };
72 |
73 | Pool.prototype.start = function () {
74 |     if (this.verbose) {
75 |         console.log('Starting to spawn workers');
76 |     }
77 |     this.spawnWorkers();
78 | };
79 |
80 | module.exports = Pool;
--------------------------------------------------------------------------------
/lib/master/WorkerControl.js:
--------------------------------------------------------------------------------
1 |
2 | var cp = require('child_process');
3 | var http = require('http');
4 | var querystring = require('querystring');
5 |
6 | var phantomjsBinPath = '/../../bin/phantomjs';
7 |
8 | var VERBOSE = false;
9 |
10 | function log(workerId, msg) {
11 |     if (VERBOSE) {
12 |         console.log(' #' + workerId + ' ' + msg);
13 |     }
14 | }
15 |
16 | function createError(workerId, msg) {
17 |     var err = new Error(msg);
18 |     err.workerId = workerId;
19 |     return err;
20 | }
21 |
22 | // Number of current workers, to give new workers an id
23 | var workerId = 0;
24 |
25 | function Worker(pool) {
26 |     this.id = workerId;
27 |     workerId++;
28 |     this.workerData = {
29 |         id : this.id
30 |     };
31 |     this.pool = pool;
32 |     this.createProcess();
33 |     this.waitingTimeout = null;
34 |     if (this.pool.verbose) {
35 |         VERBOSE = true;
36 |     }
37 |
38 |     this.alive = true;
39 | }
40 |
41 | // Create the process of a PhantomJS worker
42 | Worker.prototype.createProcess = function() {
43 |
44 |     // first kill the old worker process if there is still one
45 |     if (this.proc) {
46 |         log(this.id, 'killing worker');
47 |         this.proc.kill();
48 |     }
49 |
50 |     var that = this;
51 |     that.port = undefined;
52 |
53 |     var clArgs = [__dirname + '/../../lib/worker/Worker.js', this.id, this.pool.workerFile];
54 |     if (this.pool.phantomjsOptions) {
55 |         clArgs.unshift.apply(clArgs, this.pool.phantomjsOptions);
56 |     }
57 |
58 |     // Spawn process
59 |     this.proc = cp.spawn(that.pool.phantomjsBinary, clArgs, { cwd : process.cwd() });
60 |     this.proc.on('error', function (err) {
61 |         if (err.message.indexOf('ENOENT') !== -1) {
62 |             throw new Error('phantomjsBinary not found: ' + that.pool.phantomjsBinary + ' (Full error: ' + err.message + ')');
63 |         } else {
64 |             throw new Error('Problem starting the PhantomJS process: ' + err.message);
65 |         }
66 |     });
67 |     this.proc.stdout.on('data', function (rawData) {
68 |         var data = rawData.toString();
69 |
70 |         // parse the first data from the worker and interpret it as the port number, or output it
71 |         if (that.port === undefined && data.indexOf('#|#port#|#') !== -1) {
72 |             var splitted = data.split('#|#port#|#');
73 |             that.port = parseInt(splitted[1]);
74 |             log(that.id, ' starting on port: ' + that.port);
75 |
76 |             // we are now fully set up and can start working
77 |             that.readyForWork();
78 |         } else {
79 |             // output logging calls of the user's custom worker
80 |             data.split('\n').forEach(function(line) {
81 |                 if (line.trim().length !== 0) {
82 |                     console.log(' #' + that.id + ' >> ' + line);
83 |                 }
84 |             });
85 |         }
86 |     });
87 |
88 |     // This should not happen, but just in case, we log it...
89 |     this.proc.stderr.on('data', function (data) {
90 |         data.toString().split('\n').forEach(function(line) {
91 |             if (line.trim().length !== 0) {
92 |                 console.log(' #' + that.id + ' STDERR >> ' + line); // TODO: Write this into STDERR
93 |             }
94 |         });
95 |     });
96 |
97 |     // If the process is killed or closed we want to start another one
98 |     this.proc.on('close', function (code, signal) {
99 |         log(that.id, 'process closed');
100 |         clearTimeout(that.waitingTimeout); // remove the timeout (which checks if the worker is stuck) if we have one running
101 |
102 |         // only do all that if we did not close the process on our own
103 |         if (signal !== 'SIGTERM') {
104 |             // if we close the process on our own, we have already opened the next proc, so let's not set it to null
105 |             that.proc = null; // there is no process anymore attached to this worker
106 |
107 |             // code === 0 means the worker closed as expected after it crawled several websites
108 |             // (planned closing because of memory leak problems)
109 |             if (code !== 0) { // a non-zero code means the worker did not close cleanly
110 |                 log(that.id, 'closed with error code ' + code + ', signal: ' + signal);
111 |                 // use the callback to signal the error
112 |                 if (that.currentJob && that.currentJob.callback) {
113 |                     that.currentJob.callback(createError(that.id, 'PhantomJS error, closing signal: ' + signal));
114 |                 }
115 |             }
116 |
117 |             // if the worker is still needed, restart the process
118 |             if (that.alive) {
119 |                 log(that.id, 'recreating phantomjs instance');
120 |                 that.createProcess();
121 |             }
122 |         }
123 |
124 |     });
125 |
126 | };
127 |
128 |
129 | // called when the worker has no job and is ready to receive work
130 | Worker.prototype.readyForWork = function() {
131 |     if (this.currentJob) {
132 |         log(this.id, 'ignoring the last job: ' + JSON.stringify(this.currentJob.data));
133 |     }
134 |
135 |     var that = this;
136 |     this.pool.getJob(function (data, doneCallback) {
137 |         if (data === null) { // no more data, we can close this worker
138 |             if (that.proc) {
139 |                 log(that.id, 'closing worker');
140 |                 that.proc.kill();
141 |             }
142 |             that.alive = false;
143 |         } else if (!that.alive) {
144 |             throw createError(that.id, 'Worker was already closed. You cannot reuse a closed worker!');
145 |         } else {
146 |             that.work(data, doneCallback);
147 |         }
148 |     }, this.workerData);
149 | };
150 |
151 | // called by the master -> contains a new job and a callback that should be called when the job is done or erroneous
152 | Worker.prototype.work = function(data, givenJobCallback) {
153 |     var that = this;
154 |     that.currentJob = {
155 |         data : data,
156 |         callback : givenJobCallback
157 |     };
158 |     log(this.id, 'new job ' + JSON.stringify(data));
159 |
160 |     function jobCallback(err, data) {
161 |         if (givenJobCallback) {
162 |             givenJobCallback(err, data);
163 |         }
164 |     }
165 |
166 |     // we will now send this job to the PhantomJS instance via REST
167 |     // the PhantomJS instance has opened a port for this which accepts REST calls
168 |
169 |     // The data we want to submit via POST
170 |     var postData = querystring.stringify({
171 |         data : JSON.stringify(data)
172 |     });
173 |
174 |     // parameters for the request
175 |     var options = {
176 |         hostname: '127.0.0.1',
177 |         port: this.port,
178 |         path: '/',
179 |         method: 'POST',
180 |         headers: {
181 |             'Content-Type': 'application/x-www-form-urlencoded',
182 |             'Content-Length': Buffer.byteLength(postData) // byte length, since the data may contain multi-byte characters
183 |         }
184 |     };
185 |
186 |     // start a timeout that kills the job and process if we do not receive an answer from the worker in time
187 |     that.waitingTimeout = setTimeout(function() {
188 |         log(that.id, 'worker seems to be dead, we got no response for ' + JSON.stringify(data) + ' / ' + (new Date()).toString());
189 |         jobCallback(createError(that.id, 'Worker Timeout'));
190 |         that.waitingTimeout = null;
191 |         workerRequest.abort();
192 |
193 |         that.createProcess(); // this will kill the currently running job and restart a new process
194 |     }, that.pool.workerTimeout);
195 |
196 |     // the actual request
197 |     var workerRequest = http.request(options, function(res) {
198 |         var body = '';
199 |         res.on('data', function (chunk) {
200 |             body += chunk; // append chunks to get the whole body
201 |         });
202 |
203 |         // we got our response, let's check what's in the box
204 |         res.on('end', function () {
205 |             if (that.waitingTimeout) {
206 |                 clearTimeout(that.waitingTimeout); // clear the "worker did not answer" timeout
207 |                 log(that.id, 'received result: ' + body);
208 |                 try {
209 |                     // parse results and pass them to our callback
210 |                     var result = JSON.parse(body);
211 |                 } catch (jsonParseError) {
212 |                     // if that happens, we are in trouble
213 |                     jobCallback(createError(that.id, 'JSON.parse error (content: ' + body + ')'));
214 |                     that.createProcess(); return; // without a parsed result we cannot continue below
215 |                 }
216 |                 if (result.status === 'success') {
217 |                     jobCallback(null, result.data);
218 |                 } else if (result.status === 'fail') {
219 |                     jobCallback(createError(that.id, result.errMessage), result.data);
220 |                 } else {
221 |                     jobCallback(createError(that.id, 'Communication error between Master and Worker'));
222 |                     result.closing = true;
223 |                     that.createProcess();
224 |                 }
225 |                 that.currentJob = null;
226 |
227 |                 // check if the phantomjs instance will close down
228 |                 // if the worker signals that it is closing, we just wait for it to close
229 |                 // otherwise we get a new job for the worker
230 |                 if (!result.closing) {
231 |                     that.readyForWork();
232 |                 }
233 |             }
234 |         });
235 |     });
236 |
237 |     workerRequest.on('error', function(e) {
238 |         // this should only happen if the worker somehow does not answer and we kill the process
239 |         log(that.id, 'problem with request: ' + e.message);
240 |     });
241 |
242 |     // send request
243 |     workerRequest.write(postData);
244 |     workerRequest.end();
245 | };
246 |
247 |
248 | // factory for simplicity
249 | Worker.create = function(pool) {
250 |     return new Worker(pool);
251 | };
252 |
253 | module.exports = Worker;
--------------------------------------------------------------------------------
/lib/phantomjs-pool.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('./master/Pool');
3 |
4 | module.exports = {
5 |     Pool : Pool
6 | };
--------------------------------------------------------------------------------
/lib/worker/Worker.js:
--------------------------------------------------------------------------------
1 | var webserver = require('webserver');
2 | var system = require('system');
3 |
4 | // our workerId as assigned by the master
5 | var workerId = parseInt(system.args[system.args.length-2]);
6 | var workerData = {
7 |     id : workerId
8 | };
9 |
10 | // location of the user's worker file
11 | var workerFile = system.args[system.args.length-1];
12 |
13 | var customWorker;
14 | (function() {
15 |     // set up some helper variables the worker script can use
16 |     var lastSlash = Math.max(workerFile.lastIndexOf('/'), workerFile.lastIndexOf('\\'));
17 |     __workerDirname = workerFile.substr(0, lastSlash);
18 |     __workerFilename = workerFile;
19 |     customWorker = require(workerFile);
20 | }());
21 |
22 |
23 | // how many jobs to work on before we restart // TODO this should be configurable
24 | var REQUESTS_BEFORE_WORKER_RESTART = 30;
25 |
26 | // count requests so we can close down once the max number (above) is reached
27 | var totalRequests = 0;
28 |
29 | function workerRequest(req, res) {
30 |     totalRequests++;
31 |
32 |     // the job was executed, let's inform the master
33 |     function jobDone(err, data) {
34 |         // TODO: check if function was already called before
35 |
36 |         // check if we close the connection after this (to prevent memory leaks)
37 |         var closing = totalRequests > REQUESTS_BEFORE_WORKER_RESTART;
38 |
39 |         var msg = {};
40 |
41 |         if (err) {
42 |             msg.errMessage = err.message;
43 |             msg.status = 'fail';
44 |             closing = true; // always close the worker if any error happens
45 |         } else {
46 |             msg.status = 'success';
47 |         }
48 |         msg.data = data;
49 |         msg.closing = closing;
50 |
51 |         // send our data back to the master
52 |         res.statusCode = 200;
53 |         res.write(JSON.stringify(msg));
54 |         res.close();
55 |
56 |         // close this worker if necessary
57 |         if (closing) {
58 |             phantom.exit();
59 |         }
60 |     }
61 |
62 |     // contains our job data
63 |     var data = req.post.data;
64 |     // we parse it and pass it to our customWorker
65 |     if (data) {
66 |         try {
67 |             var parsedData = JSON.parse(data); // parsing errors are reported back as a failed job
68 |             customWorker(parsedData, jobDone, workerData);
69 |         } catch (e) {
70 |             res.statusCode = 200;
71 |             res.write(JSON.stringify({
72 |                 errMessage : e.message,
73 |                 status : 'fail',
74 |                 closing : true
75 |             }));
76 |             res.close();
77 |             phantom.exit();
78 |         }
79 |     } else {
80 |         // sometimes the server seems to have problems receiving any data
81 |         res.statusCode = 200;
82 |         res.write(JSON.stringify({
83 |             status : 'fail',
84 |             data : 'No data for worker received ' + JSON.stringify(req.post)
85 |         }));
86 |         res.close();
87 |     }
88 | }
89 |
90 | // we create a simple HTTP web server
91 | var server = webserver.create();
92 |
93 | // we want to find a port to open a REST server
94 | // try ports randomly until we find a usable one
95 | var portUsable = false;
96 | var port;
97 | while (!portUsable) {
98 |     port = 1024 + parseInt(Math.random() * 40000);
99 |     // port = 35556;
100 |     portUsable = server.listen('127.0.0.1:' + port, workerRequest);
101 | }
102 |
103 | // output the port on the console, this will tell the master on which port it can talk to us
104 | console.log('#|#port#|#' + port + '#|#port#|#');
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "phantomjs-pool",
3 |   "version": "0.3.2",
4 |   "description": "Manage a Pool of PhantomJS instances and distribute jobs among the workers",
5 |   "main": "lib/phantomjs-pool.js",
6 |   "repository": {
7 |     "type": "git",
8 |     "url": "https://github.com/thomasdondorf/phantomjs-pool.git"
9 |   },
10 |   "keywords": [
11 |     "phantomjs",
12 |     "pool"
13 |   ],
14 |   "author": "Thomas Dondorf",
15 |   "license": "MIT",
16 |   "homepage": "https://github.com/thomasdondorf/phantomjs-pool"
17 | }
--------------------------------------------------------------------------------