├── .gitignore
├── LICENSE.txt
├── README.md
├── examples
│   ├── crawl-alexa-1m
│   │   ├── createHar.js
│   │   ├── master.js
│   │   ├── top-1m-dummy.csv
│   │   └── worker.js
│   ├── message-passing
│   │   ├── master.js
│   │   └── worker.js
│   ├── minimal
│   │   ├── master.js
│   │   └── worker.js
│   └── simple-crawler
│       ├── master.js
│       └── worker.js
├── lib
│   ├── master
│   │   ├── Pool.js
│   │   └── WorkerControl.js
│   ├── phantomjs-pool.js
│   └── worker
│       └── Worker.js
└── package.json

/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | .idea
3 | examples/crawl-alexa-1m/results
4 | examples/crawl-alexa-1m/top-1m.csv
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015 Thomas Dondorf
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PhantomJS Pool Library
2 |
3 | Create a pool of PhantomJS workers.
4 |
5 | ## Install
6 |
7 | `npm install phantomjs-pool`
8 |
9 | Additionally, get the PhantomJS binary (via `npm install phantomjs-prebuilt`, `npm install phantomjs`, or `npm install phantomjs2`)
10 | or download the binary file yourself.
11 |
12 | ## Usage
13 |
14 | Check out the examples directory. Here is a minimal example, which saves screenshots of the Google search results for the numbers 0 to 9 using four workers.
15 |
16 | #### master.js
17 |
18 |     var Pool = require('phantomjs-pool').Pool;
19 |
20 |     function jobCallback(job, worker, index) {
21 |
22 |         if (index < 10) { // we just use the index as our data
23 |             job(index, function(err) {
24 |                 console.log('DONE: ' + index);
25 |             });
26 |         } else { // no more jobs
27 |             job(null);
28 |         }
29 |     }
30 |
31 |     var pool = new Pool({
32 |         numWorkers : 4,
33 |         jobCallback : jobCallback,
34 |         workerFile : __dirname + '/worker.js' // location of the worker file (as an absolute path)
35 |     });
36 |     pool.start();
37 |
38 | #### worker.js
39 |
40 |     var webpage = require('webpage');
41 |
42 |     module.exports = function(data, done, worker) {
43 |         var page = webpage.create();
44 |
45 |         // search for the given data (which contains the index number) and save a screenshot
46 |         page.open('https://www.google.com/search?q=' + data, function() {
47 |             page.render('google' + data + '.png');
48 |             done(null);
49 |         });
50 |
51 |     };
52 |
53 | ## How does it work?
54 |
55 | The master file (master.js in the example) is executed via Node.js and spawns multiple PhantomJS processes.
56 | Each PhantomJS process creates a server to communicate with the master process.
57 | That way, the data from the master is submitted to the worker.
58 | The worker file (worker.js in the example) is loaded into the PhantomJS environment and receives the data from the master process.
59 | After executing a job, the worker calls the done function to signal that it is ready for the next job.
60 |
61 | Some of the features of the library:
62 | * Interoperability between Node.js (master) and PhantomJS (workers)
63 | * Distribution of jobs between workers
64 | * Simple error reporting, error handling and logging
65 | * Restart of workers if necessary (due to [memory leaks](https://github.com/ariya/phantomjs/issues/11390))
66 | * Recreation of crashed workers (due to [segmentation faults](https://github.com/ariya/phantomjs/issues/13175))
67 | * Restart of workers that are stuck (not calling the done function)
68 |
69 |
70 | ## Documentation
71 |
72 | ### Master (Pool)
73 |
74 | Require the library to get access to `Pool`:
75 |
76 |     var Pool = require('phantomjs-pool').Pool;
77 |
78 | The constructor has the following options:
79 |
80 | * `workerFile` -- This is the PhantomJS JavaScript file that contains the logic for the worker.
81 | Make sure to use an absolute path or simply use `__dirname` followed by the path to your file.
82 | * `jobCallback` -- Expects a function which is called each time a worker is ready to receive a job. This function is described in detail below.
83 | * `phantomjsBinary` (optional) -- The path to the PhantomJS binary. You can omit this field if you have the `phantomjs-prebuilt`, `phantomjs`
84 | or `phantomjs2` npm module installed; the library will then use whichever of these modules it finds.
85 | As an alternative you can simply download the binary yourself and use this property to specify its path.
86 | * `numWorkers` (default: `2`) -- Number of PhantomJS workers used. This represents how many websites can be crawled simultaneously.
87 | Depending on the system resources and the available network throughput, a value between 4 and 20 might be desirable.
88 | * `spawnWorkerDelay` (default: `0`) -- Most of the time we do not want to spawn all workers at the same time,
89 | as this would cause a network peak at the beginning.
90 | The given number is interpreted as the delay in milliseconds between the spawning of two workers.
91 | If the value is set to 100, the first worker will spawn instantly, the second worker will spawn with a 100ms delay,
92 | the third with a 200ms delay, and so on.
93 | * `phantomjsOptions` (default: `[]`) -- Expects an array containing command line arguments for the PhantomJS binary.
94 | This can be used when using a proxy or another feature of PhantomJS that needs to be configured via the command line.
95 | Example: `["--proxy=127.0.0.1:8080", "--proxy-type=http"]`
96 | * `verbose` (default: `false`) -- If the flag is set to true, the library logs the communication between
97 | master and worker as well as some additional information which might help resolve problems.
98 | * `workerTimeout` (default: `120000` = 2min) -- The time in milliseconds a worker may work without giving feedback.
99 | If a worker does not respond within that time, its process is killed and the job is marked as erroneous.
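For illustration, here is a sketch that combines several of these options (the proxy address and the delay values are placeholders, not recommendations, and a `jobCallback` function as described in the next section is assumed):

    var Pool = require('phantomjs-pool').Pool;

    var pool = new Pool({
        workerFile       : __dirname + '/worker.js',
        jobCallback      : jobCallback,
        numWorkers       : 8,                 // crawl eight pages at the same time
        spawnWorkerDelay : 250,               // stagger worker startup by 250ms each
        phantomjsOptions : ['--proxy=127.0.0.1:8080', '--proxy-type=http'],
        workerTimeout    : 60 * 1000,         // treat a worker as stuck after 1min of silence
        verbose          : false
    });
    pool.start();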
100 |
101 | #### jobCallback
102 |
103 | The provided `jobCallback` function is called each time a worker is ready to receive a job.
104 | The function is called with three arguments: `job`, `worker`, `index`
105 | * `job(data[, callback])` -- A function that expects two arguments. The first argument contains the data that will be sent to the worker.
106 | This needs to be a valid JSON object (properties such as functions will not be sent to the worker). The second argument is optional
107 | and can be used to provide a callback function which will be called when the job has been executed
108 | (for simplicity, let's name this function `afterJobCallback`).
109 | The `afterJobCallback` function is called after the worker executed the job, with an error and additional information: `afterJobCallback(error, data)`
110 | The first parameter (`error`) is `null` if the job was successful, or an object of type `Error`. To read the error message use `error.message`.
111 | The error can either be a library-specific error message, a PhantomJS error message or a message that has been declared by the worker script (via the error passed to the `done` function).
112 | The `data` object contains the data that is sent by the worker using the `done` function. If the worker did not send any data, `data` is undefined.
113 | * `worker` -- Contains information about the worker. Currently this is only the ID. Each worker gets an ID (starting at 0).
114 | * `index` -- The value is `0` for the first call of the `jobCallback` function and increments with each following job.
115 | This makes it very simple to work through an array of jobs, as the example below shows.
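As an example, a `jobCallback` that works through an array of URLs and uses `afterJobCallback` for logging might look like this (a minimal sketch in the style of the bundled simple-crawler example; the URLs are placeholders):

    var urls = ['http://www.example.com/', 'http://phantomjs.org/'];

    function jobCallback(job, worker, index) {
        if (index < urls.length) {
            // send the next URL to the worker that just became ready
            job({ url : urls[index] }, function afterJobCallback(error, data) {
                if (error) {
                    console.log('Failed: ' + urls[index] + ' (' + error.message + ')');
                } else {
                    console.log('Done: ' + urls[index]);
                }
            });
        } else {
            job(null); // no more jobs, this worker can be shut down
        }
    }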
116 |
117 | ### Worker
118 |
119 | The exports object needs to be a single function, which will be called with three arguments: `data`, `done`, `worker`
120 |
121 |     module.exports = function(data, done, worker) { /* ... */ }
122 |
123 | #### data
124 |
125 | The `data` object contains the data that has been sent via the `job` function in the `jobCallback` function.
126 |
127 | #### done
128 |
129 | The `done` function needs to be called by the script after the execution of the job.
130 | The first parameter may contain an error. The second parameter may contain additional information.
131 |
132 | Examples:
133 | * `done()` -- The job has been executed successfully. No additional data is provided for the master.
134 | * `done(null, { foo : "bar" })` -- The job has been executed successfully. The additional data will be passed to the master. This can be any valid JSON object.
135 | See the `jobCallback` function to read where the data will be received.
136 | * `done(new Error("Crawl Error"))` -- An error happened during the execution. The error reason should be passed in the constructor.
137 | Additional information that is added to the error object will not be sent to the master.
138 | Therefore, do not add extra properties to the error object. Use the second argument to send additional data.
139 | * `done(new Error("Crawl Error"), { problem : "...", foo : [1,2,3] })` -- An error happened. The second argument can again be used to send additional information.
140 |
141 |
142 | #### worker
143 |
144 | The worker object contains information about the worker itself. Currently, this is only the ID of the worker.
145 |
146 | * `id` -- ID of the worker, e.g. `worker.id` is `2` for the third worker (counting starts at zero).
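Putting the three arguments together, a worker that reports either a result or an error back to the master could look like this (a sketch which assumes that the job data contains a `url` property, as in the examples above):

    var webpage = require('webpage');

    module.exports = function(data, done, worker) {
        var page = webpage.create();

        page.open(data.url, function(status) {
            if (status !== 'success') {
                // the message is received by the master as error.message
                done(new Error('Could not open ' + data.url));
            } else {
                // send the page title back to the master
                done(null, { title : page.title, crawledBy : worker.id });
            }
        });
    };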
147 |
148 | ## License
149 |
150 | MIT License.
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/createHar.js:
--------------------------------------------------------------------------------
1 | // Based on https://gist.github.com/dongyuwei/3689928
2 |
3 | if (!Date.prototype.toISOString) {
4 |     Date.prototype.toISOString = function () {
5 |         function pad(n) { return n < 10 ? '0' + n : n; }
6 |         function ms(n) { return n < 10 ? '00' + n : n < 100 ? '0' + n : n; }
7 |         return this.getUTCFullYear() + '-' +
8 |             pad(this.getUTCMonth() + 1) + '-' +
9 |             pad(this.getUTCDate()) + 'T' +
10 |             pad(this.getUTCHours()) + ':' +
11 |             pad(this.getUTCMinutes()) + ':' +
12 |             pad(this.getUTCSeconds()) + '.' +
13 |             ms(this.getUTCMilliseconds()) + 'Z'; // the trailing 'Z' denotes UTC, so we use the UTC getters
14 |     };
15 | }
16 |
17 | function createHAR(address, title, startTime, endTime, resources) {
18 |     var entries = [];
19 |
20 |     resources.forEach(function (resource) {
21 |         var request = resource.request,
22 |             startReply = resource.startReply,
23 |             endReply = resource.endReply;
24 |
25 |         if (!request || !startReply || !endReply) {
26 |             return;
27 |         }
28 |
29 |         // Exclude data URIs from the HAR file because
30 |         // they aren't part of the specification
31 |         if (request.url.substr(0, 5).toLowerCase() === 'data:') {
32 |             return;
33 |         }
34 |
35 |         entries.push({
36 |             startedDateTime: request.time.toISOString(),
37 |             time: endReply.time - request.time,
38 |             request: {
39 |                 method: request.method,
40 |                 url: request.url,
41 |                 httpVersion: "HTTP/1.1",
42 |                 cookies: [],
43 |                 headers: request.headers,
44 |                 queryString: [],
45 |                 headersSize: -1,
46 |                 bodySize: -1
47 |             },
48 |             response: {
49 |                 status: endReply.status,
50 |                 statusText: endReply.statusText,
51 |                 httpVersion: "HTTP/1.1",
52 |                 cookies: [],
53 |                 headers: endReply.headers,
54 |                 redirectURL: "",
55 |                 headersSize: -1,
56 |                 bodySize: startReply.bodySize,
57 |                 content: {
58 |                     size: startReply.bodySize,
59 |                     mimeType: endReply.contentType
60 |                 }
61 |             },
62 |             cache: {},
63 |             timings: {
64 |                 blocked: 0,
65 |                 dns: -1,
66 |                 connect: -1,
67 |                 send: 0,
68 |                 wait: startReply.time - request.time,
69 |                 receive: endReply.time - startReply.time,
70 |                 ssl: -1
71 |             },
72 |             pageref: address
73 |         });
74 |     });
75 |
76 |     return {
77 |         log: {
78 |             version: '1.2',
79 |             creator: {
80 |                 name: 'PhantomJS',
81 |                 version: phantom.version.major + '.' + phantom.version.minor + '.' + phantom.version.patch
82 |             },
83 |             pages: [{
84 |                 startedDateTime: startTime.toISOString(),
85 |                 id: address,
86 |                 title: title,
87 |                 pageTimings: {
88 |                     onLoad: endTime - startTime
89 |                 }
90 |             }],
91 |             entries: entries
92 |         }
93 |     };
94 | }
95 |
96 | module.exports = createHAR;
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/master.js:
--------------------------------------------------------------------------------
1 |
2 | var fs = require('fs');
3 | var Pool = require('../../lib/phantomjs-pool').Pool;
4 |
5 | var ALEXA_FILE = 'top-1m.csv';
6 | var ALEXA_SIZE = 1000000;
7 |
8 | // Check if the Alexa Top 1 Million file was downloaded, otherwise we will simply use the dummy file
9 | if (!fs.existsSync(ALEXA_FILE)) {
10 |     console.log('Please download and unzip the Alexa 1 Million file top-1m.csv and place it in this directory:');
11 |     console.log('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip');
12 |
13 |     console.log('');
14 |     console.log('We will now continue using a dummy file that only contains 20 entries.');
15 |
16 |
17 |     ALEXA_FILE = 'top-1m-dummy.csv';
18 |     ALEXA_SIZE = 20;
19 |
20 |     setTimeout(startCrawling, 2000);
21 | } else {
22 |     startCrawling();
23 | }
24 |
25 | function startCrawling() {
26 |     console.log('Reading ' + ALEXA_FILE);
27 |     var lines = fs.readFileSync(ALEXA_FILE).toString().split('\n');
28 |     console.log(' - Done.');
29 |
30 |     var total = 0;
31 |     var successful = 0;
32 |
33 |     // Called when a worker is ready for a new job
34 |     // job is the function that needs to be called to execute the job
35 |     // index contains a number (starting at 0) that is increased with each jobCallback call
36 |     function jobCallback(job, worker, index) {
37 |
38 |         if (index < ALEXA_SIZE) {
39 |             var line = lines[index].trim();
40 |
41 |             var split = line.split(',');
42 |             var id = parseInt(split[0]);
43 |             var url = split[1];
44 |             job({
45 |                 id : id,
46 |                 url : url
47 |             }, function(err) {
48 |                 // Let's log if it worked
49 |                 total++;
50 |                 if (err) {
51 |                     console.log('Problem #' + worker.id + ': ' + err.message + ' for line: ' + line);
52 |                 } else {
53 |                     console.log(' #' + worker.id + ' DONE: ' + line);
54 |                     successful++;
55 |                 }
56 |                 if (total % 10 === 0) {
57 |                     console.log('################## ' + successful + '/' + total + ' (success/crawled) ################################');
58 |                 }
59 |             });
60 |         } else {
61 |             // no more content!
62 |             job(null);
63 |         }
64 |     }
65 |
66 |     var pool = new Pool({
67 |         numWorkers : 4,
68 |         // verbose : true, // enable if you want to see more
69 |         jobCallback : jobCallback,
70 |         workerFile : __dirname + '/worker.js' // location of our worker file (as an absolute path)
71 |     });
72 |     pool.start();
73 | }
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/top-1m-dummy.csv:
--------------------------------------------------------------------------------
1 | 1,google.com
2 | 2,facebook.com
3 | 3,youtube.com
4 | 4,baidu.com
5 | 5,yahoo.com
6 | 6,wikipedia.org
7 | 7,amazon.com
8 | 8,twitter.com
9 | 9,taobao.com
10 | 10,qq.com
11 | 11,google.co.in
12 | 12,live.com
13 | 13,linkedin.com
14 | 14,sina.com.cn
15 | 15,weibo.com
16 | 16,yahoo.co.jp
17 | 17,tmall.com
18 | 18,google.co.jp
19 | 19,ebay.com
20 | 20,google.de
--------------------------------------------------------------------------------
/examples/crawl-alexa-1m/worker.js:
--------------------------------------------------------------------------------
1 |
2 | var webpage = require('webpage');
3 | var fs = require('fs');
4 |
5 | var createHar = require('./createHar');
6 |
7 | module.exports = function(data, done, worker) {
8 |
9 |     var page = webpage.create();
10 |     page.clearCookies();
11 |     page.clearMemoryCache();
12 |     page.settings.resourceTimeout = 30000;
13 |
14 |     var resources = [];
15 |     var startTime = -1;
16 |     var endTime = -1;
17 |
18 |     var address = 'http://' + data.url;
19 |
20 |     // PhantomJS's own onLoadFinished event does not always work, so we basically have to implement it on our own.
21 |     // For this we check the requests, and as soon as there have been no outgoing requests for some time (and all
22 |     // responses have arrived) we assume the page is loaded
23 |
24 |     // if no other request is made within 100ms, we consider the page loaded
25 |     var FINAL_TIMEOUT = 100;
26 |
27 |     var finalCheckTimeout = null;
28 |     var openRequests = 0;
29 |
30 |     var isLoaded = false;
31 |     function pageLoaded(status) {
32 |         if (!isLoaded) {
33 |             isLoaded = true;
34 |             if (status !== 'success') {
35 |                 done(new Error('Crawl Error: ' + page.reason + ' for ' + page.reason_url));
36 |             } else {
37 |                 logPage();
38 |             }
39 |         }
40 |     }
41 |
42 |     function logPage() {
43 |         var endTime = new Date();
44 |         var title = page.evaluate(function () {
45 |             return document.title;
46 |         });
47 |
48 |         var har = createHar(address, title, startTime, endTime, resources);
49 |
50 |         // we don't want 1M files in one directory, so we divide them into directories of 1000 files each
51 |         var dirId = parseInt(data.id / 1000)*1000;
52 |         var fileName = __workerDirname + '/results/' + dirId + '/' + data.id + '-' + data.url.replace(/[^\w.,;+\-]/g, '_') + '.json';
53 |         fs.write(fileName, JSON.stringify(har, null, 4), 'w');
54 |
55 |         done();
56 |     }
57 |
58 |
59 |
60 |
61 |     page.onLoadStarted = function () {
62 |         startTime = new Date();
63 |     };
64 |
65 |     page.onResourceRequested = function (req) {
66 |         clearTimeout(finalCheckTimeout);
67 |         resources[req.id] = {
68 |             request: req,
69 |             startReply: null,
70 |             endReply: null
71 |         };
72 |         openRequests++;
73 |     };
74 |
75 |     page.onResourceReceived = function (res) {
76 |         if (res.stage === 'start') {
77 |             resources[res.id].startReply = res;
78 |         } else if (res.stage === 'end') {
79 |             resources[res.id].endReply = res;
80 |             openRequests--;
81 |
82 |             if (openRequests === 0) {
83 |                 finalCheckTimeout = setTimeout(function() {
84 |                     if (!isLoaded) {
85 |                         console.log('ALTERNATIVE LOADING EVENT!');
86 |                     }
87 |                     pageLoaded('success'); // we assume everything is fine
88 |                 }, FINAL_TIMEOUT);
89 |             }
90 |         }
91 |     };
92 |
93 |     page.onResourceError = function (resourceError) {
94 |         page.reason = resourceError.errorString;
95 |         page.reason_url = resourceError.url;
96 |     };
97 |
98 |     page.onError = function (msg, trace) {
99 |         // we intentionally ignore errors happening on the page itself
100 |     };
101 |
102 |     page.open(address, pageLoaded);
103 |
104 | };
105 |
--------------------------------------------------------------------------------
/examples/message-passing/master.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('../../lib/phantomjs-pool').Pool;
3 |
4 | function jobCallback(job, worker, index) {
5 |     job({
6 |         index : index,
7 |         moreData : "Hello World!"
8 |     }, function(err, data) {
9 |         if (err) {
10 |             console.log('We got an error for worker #' + err.workerId + ': ' + err.message);
11 |         } else {
12 |             console.log('I got data back from worker #' + data.workerId + ': ' + data.indexBack + ' (more data: ' + data.foo + ').');
13 |         }
14 |     });
15 | }
16 |
17 |
18 | var pool = new Pool({
19 |     numWorkers : 3,
20 |     jobCallback : jobCallback,
21 |     workerFile : __dirname + '/worker.js'
22 | });
23 | pool.start();
--------------------------------------------------------------------------------
/examples/message-passing/worker.js:
--------------------------------------------------------------------------------
1 |
2 | module.exports = function(data, done, worker) {
3 |
4 |     console.log('Logging works too! Yay!');
5 |
6 |     if (worker.id === 1) {
7 |         // let's create an error case every time worker 1 does something
8 |         setTimeout(function() {
9 |             done(new Error('Error, I don\'t work for this worker!'));
10 |         }, 2000);
11 |     } else {
12 |         var result = {
13 |             workerId : worker.id,
14 |             indexBack : data.index, // this does not make much sense, but let's just send the index back
15 |             foo : 'Greetings, Friend!'
16 |         };
17 |
18 |         // let's make it look like we did some work...
19 |         setTimeout(function() {
20 |             done(null, result);
21 |         }, 5000 + Math.random());
22 |
23 |     }
24 | };
--------------------------------------------------------------------------------
/examples/minimal/master.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('../../lib/phantomjs-pool').Pool;
3 |
4 | function jobCallback(job, worker, index) {
5 |
6 |     if (index < 10) { // we just use the index as our data
7 |         job(index, function(err) {
8 |             console.log('DONE: ' + index);
9 |         });
10 |     } else { // no more jobs
11 |         job(null);
12 |     }
13 | }
14 |
15 | var pool = new Pool({
16 |     numWorkers : 4,
17 |     jobCallback : jobCallback,
18 |     workerFile : __dirname + '/worker.js' // location of our worker file (as an absolute path)
19 | });
20 | pool.start();
--------------------------------------------------------------------------------
/examples/minimal/worker.js:
--------------------------------------------------------------------------------
1 |
2 | var webpage = require('webpage');
3 |
4 | module.exports = function(data, done, worker) {
5 |     var page = webpage.create();
6 |
7 |     // search for the given data (which contains the index number) and save a screenshot
8 |     page.open('https://www.google.com/search?q=' + data, function() {
9 |         page.render('google' + data + '.png');
10 |         done(null);
11 |     });
12 |
13 | };
--------------------------------------------------------------------------------
/examples/simple-crawler/master.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('../../lib/phantomjs-pool').Pool;
3 |
4 | var pages = [
5 |     'http://www.google.com/',
6 |     'http://www.example.com/',
7 |     'http://www.stackoverflow.com/',
8 |     'http://phantomjs.org/',
9 |     'http://www.nodejs.org/',
10 |     'http://www.reddit.com/',
11 |     'http://www.youtube.com/',
12 |     'http://www.amazon.com/'
13 | ];
14 |
15 | // Called when a worker is ready for a new job
16 | // job is the function that needs to be called to execute the job
17 | // index contains a number (starting at 0) that is increased with each jobCallback call
18 | function jobCallback(job, worker, index) {
19 |
20 |     // as long as we have urls that we want to crawl, we execute the job
21 |     var url = pages[index];
22 |     if (index < pages.length) {
23 |
24 |         // the first argument contains the data which is passed to the worker
25 |         // the second argument is a callback which is called when the job has been executed
26 |         job({
27 |             url : url,
28 |             id : index
29 |         }, function(err) {
30 |             // Let's log if it worked
31 |             if (err) {
32 |                 console.log('There were some problems for url ' + url + ': ' + err.message);
33 |             } else {
34 |                 console.log('DONE: ' + url + ' (' + index + ')');
35 |             }
36 |         });
37 |     } else {
38 |         // if we have no more jobs, we call the function job with null
39 |         job(null);
40 |     }
41 | }
42 |
43 | var pool = new Pool({
44 |     numWorkers : 3,
45 |     jobCallback : jobCallback,
46 |     workerFile : __dirname + '/worker.js' // location of our worker file (as an absolute path)
47 | });
48 | pool.start();
--------------------------------------------------------------------------------
/examples/simple-crawler/worker.js:
--------------------------------------------------------------------------------
1 |
2 | var webpage = require('webpage');
3 |
4 | // the worker needs to export one function, which is called with the job
5 | module.exports = function(data, done, worker) {
6 |
7 |     // data contains the data we passed to the job function in the master file
8 |     // done is a function which needs to be called to signal that the job has been executed
9 |     // worker contains some meta data about this worker (like the id)
10 |
11 |     // we just fetch the page and save it as an image
12 |     var page = webpage.create();
13 |     page.open(data.url, function() {
14 |         page.render(data.id + '.png');
15 |
16 |         // then we call the done function with null to signal that we successfully executed the job
17 |         done(null);
18 |     });
19 |
20 | };
--------------------------------------------------------------------------------
/lib/master/Pool.js:
--------------------------------------------------------------------------------
1 |
2 | var Worker = require('./WorkerControl');
3 | var fs = require('fs');
4 |
5 | function Pool(options) {
6 |     this.size = options.numWorkers || 2;
7 |     this.spawnWorkerDelay = options.spawnWorkerDelay || 0;
8 |     this.phantomjsOptions = options.phantomjsOptions || [];
9 |     this.verbose = options.verbose || false;
10 |     this.workerTimeout = options.workerTimeout || 120 * 1000;
11 |
12 |     this.jobIndex = 0;
13 |
14 |     if (options.phantomjsBinary) {
15 |         this.phantomjsBinary = options.phantomjsBinary;
16 |     } else {
17 |         // Check if PhantomJS is installed
18 |         var phantomjsLib;
19 |         try {
20 |             phantomjsLib = require('phantomjs-prebuilt');
21 |         } catch (e) {} // Do nothing, we were just checking
22 |         try {
23 |             phantomjsLib = require('phantomjs');
24 |         } catch (e) {}
25 |         try {
26 |             phantomjsLib = require('phantomjs2');
27 |         } catch (e) {}
28 |
29 |         if (phantomjsLib) {
30 |             this.phantomjsBinary = phantomjsLib.path;
31 |         } else {
32 |             throw new Error('PhantomJS binary not found. Use the option phantomjsBinary or install phantomjs via npm.');
33 |         }
34 |     }
35 |
36 |     if (!options.workerFile) {
37 |         throw new Error('workerFile in options expected.');
38 |     }
39 |     this.workerFile = options.workerFile;
40 |
41 |     this.jobCallback = options.jobCallback;
42 |     if (!options.jobCallback) {
43 |         throw new Error('jobCallback in options expected.');
44 |     }
45 |
46 |     this.workers = [];
47 | }
48 |
49 | // Adds workers until the pool size is reached
50 | Pool.prototype.spawnWorkers = function () {
51 |     var that = this;
52 |     if (this.size > this.workers.length) {
53 |         this.addWorker();
54 |         setTimeout(function () {
55 |             that.spawnWorkers();
56 |         }, this.spawnWorkerDelay);
57 |     }
58 | };
59 |
60 | // adds one worker to the pool
61 | Pool.prototype.addWorker = function () {
62 |     if (this.verbose) {
63 |         console.log('Creating worker #' + this.workers.length);
64 |     }
65 |     this.workers.push(Worker.create(this));
66 | };
67 |
68 | Pool.prototype.getJob = function(jobCallback, workerData) {
69 |     this.jobCallback(jobCallback, workerData, this.jobIndex);
70 |     this.jobIndex++;
71 | };
72 |
73 | Pool.prototype.start = function () {
74 |     if (this.verbose) {
75 |         console.log('Starting to spawn workers');
76 |     }
77 |     this.spawnWorkers();
78 | };
79 |
80 | module.exports = Pool;
--------------------------------------------------------------------------------
/lib/master/WorkerControl.js:
--------------------------------------------------------------------------------
1 |
2 | var cp = require('child_process');
3 | var http = require('http');
4 | var querystring = require('querystring');
5 |
6 | var phantomjsBinPath = '/../../bin/phantomjs';
7 |
8 | var VERBOSE = false;
9 |
10 | function log(workerId, msg) {
11 |     if (VERBOSE) {
12 |         console.log(' #' + workerId + ' ' + msg);
13 |     }
14 | }
15 |
16 | function createError(workerId, msg) {
17 |     var err = new Error(msg);
18 |     err.workerId = workerId;
19 |     return err;
20 | }
21 |
22 | // Number of current workers, to give new workers an id
23 | var workerId = 0;
24 |
25 | function Worker(pool) {
26 |     this.id = workerId;
27 |     workerId++;
28 |     this.workerData = {
29 |         id : this.id
30 |     };
31 |     this.pool = pool;
32 |     this.createProcess();
33 |     this.waitingTimeout = null;
34 |     if (this.pool.verbose) {
35 |         VERBOSE = true;
36 |     }
37 |
38 |     this.alive = true;
39 | }
40 |
41 | // Create the process of a PhantomJS worker
42 | Worker.prototype.createProcess = function() {
43 |
44 |     // first kill the old worker process if there is still one
45 |     if (this.proc) {
46 |         log(this.id, 'killing worker');
47 |         this.proc.kill();
48 |     }
49 |
50 |     var that = this;
51 |     that.port = undefined;
52 |
53 |     var clArgs = [__dirname + '/../../lib/worker/Worker.js', this.id, this.pool.workerFile];
54 |     if (this.pool.phantomjsOptions) {
55 |         clArgs.unshift.apply(clArgs, this.pool.phantomjsOptions);
56 |     }
57 |
58 |     // Spawn process
59 |     this.proc = cp.spawn(that.pool.phantomjsBinary, clArgs, { cwd : process.cwd() });
60 |     this.proc.on('error', function (err) {
61 |         if (err.message.indexOf('ENOENT') !== -1) {
62 |             throw new Error('phantomjsBinary not found: ' + that.pool.phantomjsBinary + ' (Full error: ' + err.message + ')');
63 |         } else {
64 |             throw new Error('Problem starting the PhantomJS process: ' + err.message);
65 |         }
66 |     });
67 |     this.proc.stdout.on('data', function (rawData) {
68 |         var data = rawData.toString();
69 |
70 |         // parse the first data from the worker and interpret it as the port number, or output it
71 |         if (that.port === undefined && data.indexOf('#|#port#|#') !== -1) {
72 |             var splitted = data.split('#|#port#|#');
73 |             that.port = parseInt(splitted[1]);
74 |             log(that.id, ' starting on port: ' + that.port);
75 |
76 |             // we are now fully set up and can start working
77 |             that.readyForWork();
78 |         } else {
79 |             // output logging calls of the user's custom worker
80 |             data.split('\n').forEach(function(line) {
81 |                 if (line.trim().length !== 0) {
82 |                     console.log(' #' + that.id + ' >> ' + line);
83 |                 }
84 |             });
85 |         }
86 |     });
87 |
88 |     // This should not happen, but just in case, we log it...
89 |     this.proc.stderr.on('data', function (data) {
90 |         data.toString().split('\n').forEach(function(line) {
91 |             if (line.trim().length !== 0) {
92 |                 console.log(' #' + that.id + ' STDERR >> ' + line); // TODO: Write this into STDERR
93 |             }
94 |         });
95 |     });
96 |
97 |     // If the process is killed or closed we want to start another one
98 |     this.proc.on('close', function (code, signal) {
99 |         log(that.id, 'process closed');
100 |         clearTimeout(that.waitingTimeout); // remove the timeout (which checks if the worker is stuck) if we have one running
101 |
102 |         // only do all that if we did not close the process on our own
103 |         if (signal !== 'SIGTERM') {
104 |             // if we close the process on our own, we have already opened the next proc, so let's not set it to null
105 |             that.proc = null; // there is no process anymore attached to this worker
106 |
107 |             // code === 0 means the worker closed as expected after it crawled several websites
108 |             // (planned closing because of memory leak problems)
109 |             if (code !== 0) { // a non-zero code means the worker did not close cleanly
110 |                 log(that.id, 'closed with error code ' + code + ', signal: ' + signal);
111 |                 // use the callback to signal the error
112 |                 if (that.currentJob && that.currentJob.callback) {
113 |                     that.currentJob.callback(createError(that.id, 'PhantomJS error, closing signal: ' + signal));
114 |                 }
115 |             }
116 |
117 |             // if the worker is still needed, restart the process
118 |             if (that.alive) {
119 |                 log(that.id, 'recreating phantomjs instance');
120 |                 that.createProcess();
121 |             }
122 |         }
123 |
124 |     });
125 |
126 | };
127 |
128 |
129 | // called when the worker has no job and is ready to receive work
130 | Worker.prototype.readyForWork = function() {
131 |     if (this.currentJob) {
132 |         log(this.id, 'ignoring the last job: ' + JSON.stringify(this.currentJob.data));
133 |     }
134 |
135 |     var that = this;
136 |     this.pool.getJob(function (data, doneCallback) {
137 |         if (data === null) { // no more data, we can close this worker
138 |             if (that.proc) {
139 |                 log(that.id, 'closing worker');
140 |                 that.proc.kill();
141 |             }
142 |             that.alive = false;
143 |         } else if (!that.alive) {
144 |             throw createError(that.id, 'Worker was already closed. You cannot reuse a closed worker!');
145 |         } else {
146 |             that.work(data, doneCallback);
147 |         }
148 |     }, this.workerData);
149 | };
150 |
151 | // called by the master -> contains a new job and a callback that should be called when the job is done or erroneous
152 | Worker.prototype.work = function(data, givenJobCallback) {
153 |     var that = this;
154 |     that.currentJob = {
155 |         data : data,
156 |         callback : givenJobCallback
157 |     };
158 |     log(this.id, 'new job ' + JSON.stringify(data));
159 |
160 |     function jobCallback(err, data) {
161 |         if (givenJobCallback) {
162 |             givenJobCallback(err, data);
163 |         }
164 |     }
165 |
166 |     // we will now send this job to the PhantomJS instance via REST
167 |     // the PhantomJS instance has opened a port for this which accepts REST calls
168 |
169 |     // The data we want to submit via POST
170 |     var postData = querystring.stringify({
171 |         data : JSON.stringify(data)
172 |     });
173 |
174 |     // parameters for the request
175 |     var options = {
176 |         hostname: '127.0.0.1',
177 |         port: this.port,
178 |         path: '/',
179 |         method: 'POST',
180 |         headers: {
181 |             'Content-Type': 'application/x-www-form-urlencoded',
182 |             'Content-Length': Buffer.byteLength(postData) // byte length, since the data may contain multi-byte characters
183 |         }
184 |     };
185 |
186 |     // start a timeout that kills the job and process if we do not receive an answer from the worker in time
187 |     that.waitingTimeout = setTimeout(function() {
188 |         log(that.id, 'worker seems to be dead, we got no response for ' + JSON.stringify(data) + ' / ' + (new Date()).toString());
189 |         jobCallback(createError(that.id, 'Worker Timeout'));
190 |         that.waitingTimeout = null;
191 |         workerRequest.abort();
192 |
193 |         that.createProcess(); // this will kill the currently running job and restart a new process
194 |     }, that.pool.workerTimeout);
195 |
196 |     // the actual request
197 |     var workerRequest = http.request(options, function(res) {
198 |         var body = '';
199 |         res.on('data', function (chunk) {
200 |             body += chunk; // append chunks to get the whole body
201 |         });
202 |
203 |         // we got our response, let's check what's in the box
204 |         res.on('end', function () {
205 |             if (that.waitingTimeout) {
206 |                 clearTimeout(that.waitingTimeout); // clear the "worker did not answer" timeout
207 |                 log(that.id, 'received result: ' + body);
208 |                 try {
209 |                     // parse results and pass them to our callback
210 |                     var result = JSON.parse(body);
211 |                 } catch (jsonParseError) {
212 |                     // if that happens, we are in trouble
213 |                     jobCallback(createError(that.id, 'JSON.parse error (content: ' + body + ')'));
214 |                     that.createProcess(); return; // without a parsed result we cannot continue below
215 |                 }
216 |                 if (result.status === 'success') {
217 |                     jobCallback(null, result.data);
218 |                 } else if (result.status === 'fail') {
219 |                     jobCallback(createError(that.id, result.errMessage), result.data);
220 |                 } else {
221 |                     jobCallback(createError(that.id, 'Communication error between Master and Worker'));
222 |                     result.closing = true;
223 |                     that.createProcess();
224 |                 }
225 |                 that.currentJob = null;
226 |
227 |                 // check if the phantomjs instance will close down
228 |                 // if the worker signals that it is closing, we just wait for it to close
229 |                 // otherwise we get a new job for the worker
230 |                 if (!result.closing) {
231 |                     that.readyForWork();
232 |                 }
233 |             }
234 |         });
235 |     });
236 |
237 |     workerRequest.on('error', function(e) {
238 |         // this should only happen if the worker somehow does not answer and we kill the process
239 |         log(that.id, 'problem with request: ' + e.message);
240 |     });
241 |
242 |     // send request
243 |     workerRequest.write(postData);
244 |     workerRequest.end();
245 | };
246 |
247 |
248 | // factory for simplicity
249 | Worker.create = function(pool) {
250 |     return new Worker(pool);
251 | };
252 |
253 | module.exports = Worker;
--------------------------------------------------------------------------------
/lib/phantomjs-pool.js:
--------------------------------------------------------------------------------
1 |
2 | var Pool = require('./master/Pool');
3 |
4 | module.exports = {
5 |     Pool : Pool
6 | };
--------------------------------------------------------------------------------
/lib/worker/Worker.js:
--------------------------------------------------------------------------------
1 | var webserver = require('webserver');
2 | var system = require('system');
3 |
4 | // our workerId as assigned by the master
5 | var workerId = parseInt(system.args[system.args.length-2]);
6 | var workerData = {
7 |     id : workerId
8 | };
9 |
10 | // location of the user's worker file
11 | var workerFile = system.args[system.args.length-1];
12 |
13 | var customWorker;
14 | (function() {
15 |     // set up some helper variables the worker script can use
16 |     var lastSlash = Math.max(workerFile.lastIndexOf('/'), workerFile.lastIndexOf('\\'));
17 |     __workerDirname = workerFile.substr(0, lastSlash);
18 |     __workerFilename = workerFile;
19 |     customWorker = require(workerFile);
20 | }());
21 |
22 |
23 | // how many jobs to work on before we restart // TODO this should be configurable
24 | var REQUESTS_BEFORE_WORKER_RESTART = 30;
25 |
26 | // count requests so we can close down once the max number (above) is reached
27 | var totalRequests = 0;
28 |
29 | function workerRequest(req, res) {
30 |     totalRequests++;
31 |
32 |     // the job was executed, let's inform the master
33 |     function jobDone(err, data) {
34 |         // TODO: check if function was already called before
35 |
36 |         // check if we close the connection after this (to prevent memory leaks)
37 |         var closing = totalRequests > REQUESTS_BEFORE_WORKER_RESTART;
38 |
39 |         var msg = {};
40 |
41 |         if (err) {
42 |             msg.errMessage = err.message;
43 |             msg.status = 'fail';
44 |             closing = true; // always close the worker if any error happens
45 |         } else {
46 |             msg.status = 'success';
47 |         }
48 |         msg.data = data;
49 |         msg.closing = closing;
50 |
51 |         // send our data back to the master
52 |         res.statusCode = 200;
53 |         res.write(JSON.stringify(msg));
54 |         res.close();
55 |
56 |         // close this worker if necessary
57 |         if (closing) {
58 |             phantom.exit();
59 |         }
60 |     }
61 |
62 |     // contains our job data
63 |     var data = req.post.data;
64 |     // we parse it and pass it to our customWorker
65 |     if (data) {
66 |         try {
67 |             var parsedData = JSON.parse(data); // parsing errors are reported back as a failed job
68 |             customWorker(parsedData, jobDone, workerData);
69 |         } catch (e) {
70 |             res.statusCode = 200;
71 |             res.write(JSON.stringify({
72 |                 errMessage : e.message,
73 |                 status : 'fail',
74 |                 closing : true
75 |             }));
76 |             res.close();
77 |             phantom.exit();
78 |         }
79 |     } else {
80 |         // sometimes the server seems to have problems receiving any data
81 |         res.statusCode = 200;
82 |         res.write(JSON.stringify({
83 |             status : 'fail',
84 |             data : 'No data for worker received ' + JSON.stringify(req.post)
85 |         }));
86 |         res.close();
87 |     }
88 | }
89 |
90 | // we create a simple HTTP web server
91 | var server = webserver.create();
92 |
93 | // we want to find a port to open a REST server
94 | // try ports randomly until we find a usable one
95 | var portUsable = false;
96 | var port;
97 | while (!portUsable) {
98 |     port = 1024 + parseInt(Math.random() * 40000);
99 |     // port = 35556;
100 |     portUsable = server.listen('127.0.0.1:' + port, workerRequest);
101 | }
102 |
103 | // output the port on the console, this will tell the master on which port it can talk to us
104 | console.log('#|#port#|#' + port + '#|#port#|#');
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 |   "name": "phantomjs-pool",
3 |   "version": "0.3.2",
4 |   "description": "Manage a Pool of PhantomJS instances and distribute jobs among the workers",
5 |   "main": "lib/phantomjs-pool.js",
6 |   "repository": {
7 |     "type": "git",
8 |     "url": "https://github.com/thomasdondorf/phantomjs-pool.git"
9 |   },
10 |   "keywords": [
11 |     "phantomjs",
12 |     "pool"
13 |   ],
14 |   "author": "Thomas Dondorf",
15 |   "license": "MIT",
16 |   "homepage": "https://github.com/thomasdondorf/phantomjs-pool"
17 | }
--------------------------------------------------------------------------------