├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── lib ├── index.js └── watcher_metric.js ├── package.json └── tests └── test_watcher.js /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | coverage 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "0.10" 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, Yahoo! Inc. All rights reserved. 2 | 3 | Redistribution and use of this software in source and binary forms, 4 | with or without modification, are permitted provided that the following 5 | conditions are met: 6 | 7 | * Redistributions of source code must retain the above 8 | copyright notice, this list of conditions and the 9 | following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the 13 | following disclaimer in the documentation and/or other 14 | materials provided with the distribution. 15 | 16 | * Neither the name of Yahoo! Inc. nor the names of its 17 | contributors may be used to endorse or promote products 18 | derived from this software without specific prior 19 | written permission of Yahoo! Inc. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 22 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 24 | PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 25 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 27 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 31 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | process-watcher 2 | ======= 3 | It is responsible for: 4 | 5 | * Listening to all the nodejs processes. 6 | * Sending SIGHUP to any process that has had no event throughput within 30 seconds. 7 | * Sending SIGKILL to any process that did not die within 30 seconds of receiving SIGHUP. 8 | * Sending SIGKILL to any process that has stopped sending status updates for 60 seconds. 9 | 10 | * Writing the following metrics for the respective events described above. 11 | * watcher.proc.died: Incremented when a process has died for any reason, potentially caused by SIGKILL or other means. 12 | * watcher.proc.graceful: Incremented if SIGHUP was sent to a process. 13 | * watcher.proc.killed: Incremented when the watcher sends SIGKILL to a process. 14 | * watcher.reqcpu: Metric showing the average number of CPU jiffies a process consumes per request. 15 | 16 | Note: process-watcher depends on monitr to provide the status of each worker process. monitr needs to be 17 | started for every worker. For more details on monitr, please refer to https://github.com/yahoo/monitr/blob/master/README.md 18 | and for a code example refer to https://github.com/yahoo/monitr/blob/master/examples/monitor_me.js. 
19 | 20 | install 21 | ------- 22 | With npm do: 23 | 24 | `npm install process-watcher` 25 | 26 | usage 27 | ----- 28 | 29 | ```javascript 30 | var watcher = require('process-watcher'); 31 | var watcher_instance = new watcher.Watcher({ metric : watcher_metric, config : watcher_config }); 32 | ``` 33 | 34 | example 35 | ------- 36 | 37 | ```javascript 38 | var watcher = require('process-watcher'); 39 | 40 | /* 41 | * Dummy metric monitoring object. 42 | */ 43 | var watcher_metric = { 44 | /** 45 | * Increments metric 46 | */ 47 | increment : function (name, v) { 48 | // Add implementation as necessary 49 | }, 50 | /** 51 | * Set the metric or multiple metrics at the same time. 52 | * */ 53 | set : function (names, v) { 54 | // Add implementation as necessary 55 | } 56 | }; 57 | 58 | var dgpath = '/tmp/watcher_test_dgram', 59 | statusPath = '/tmp/watcher_status_path_test', 60 | watcher_config = { max_inactive : 0.001, monitor : 0.001, monPath: dgpath, 61 | timeout : 30, timeout_start : 60 }; 62 | 63 | //Instantiate watcher 64 | var watcher_instance = new watcher.Watcher({ metric : watcher_metric, config : watcher_config }); 65 | ``` 66 | 67 | Build Status 68 | ------------ 69 | 70 | [![Build Status](https://api.travis-ci.org/yahoo/process-watcher.png?branch=master)](http://travis-ci.org/yahoo/process-watcher) 71 | 72 | Node Badge 73 | ---------- 74 | 75 | [![NPM](https://nodei.co/npm/process-watcher.png)](https://nodei.co/npm/process-watcher/) 76 | -------------------------------------------------------------------------------- /lib/index.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013, Yahoo! Inc. All rights reserved. 3 | * Copyrights licensed under the New BSD License. 4 | * See the accompanying LICENSE file for terms. 
/*
 * process-watcher (lib/index.js)
 *
 * Receives status datagrams from node.js worker processes on a unix datagram
 * socket, keeps the latest status per PID in `statuses`, and on two timers
 *   - sends SIGHUP, then SIGKILL, to workers that look stuck (checkStatus);
 *   - drops workers that died and SIGKILLs workers that went silent
 *     (checkInactivity).
 * The collected statuses are served as JSON on a second unix socket.
 *
 * NOTE: this file must stay in sloppy mode (no 'use strict'): the
 * `delete process._nm` probe below relies on `delete` returning false instead
 * of throwing for a non-configurable property.
 */
var dgram = require('unix-dgram'),
    fs = require('fs'),
    net = require('net'),
    mkdirp = require('mkdirp'),
    spawn = require('child_process').spawn,
    path = require('path'),
    monitor,
    ENOENT = require('constants').ENOENT,
    // pid -> { curr, last, kill, debug, oreqs, listen }.
    // Module-level on purpose: shared by every Watcher and exported below.
    statuses = { },
    watcher,
    // 511 is decimal for octal 0777 (world read/write/execute).
    SOCKET_MODE = 511,
    // Extra silence, on top of the processing timeout, tolerated before a
    // worker is killed for inactivity (30000 ms + default 30000 ms timeout =
    // the 60 seconds promised by the README).
    INACTIVITY_GRACE_MS = 30000;

/**
 * Builds the "[YYYY-MM-DD HH:MM:SS] " (UTC) prefix used for every log line.
 * @returns {string} timestamp prefix, including the trailing space
 */
function getFormattedDate() {
    return "[" + new Date().toISOString().substring(0, 19).replace('T', ' ') + "] ";
}

/**
 * Wraps a console method so that its first argument is prefixed with a
 * timestamp. Any further arguments are forwarded untouched, so calls such as
 * console.log("pid %d", pid) keep working.
 * @param {Function} original the console method being wrapped
 * @returns {Function} the timestamping replacement
 */
function withTimestamp(original) {
    return function (msg) {
        var rest = Array.prototype.slice.call(arguments, 1);
        return original.apply(this, [getFormattedDate() + msg].concat(rest));
    };
}

var clog = console.log;
var cerrlog = console.error;
// Process-wide side effect: every console.log/console.error gets a timestamp.
console.log = withTimestamp(clog);
console.error = withTimestamp(cerrlog);

// `delete` yields false only when process._nm exists and is non-configurable
// (presumably a monitr build baked into the runtime -- TODO confirm). In that
// case use it and stop it from reporting on the watcher itself; otherwise
// fall back to the monitr module.
if (delete process._nm) {
    monitor = require('monitr');
} else {
    process._nm.disableReport();
}

/**
 * Sends `signal` to `pid` and, when that succeeds, increments `metricName`.
 * Failures (for instance the process is already gone) are logged, not thrown.
 * @param {string|number} pid target process id
 * @param {string} signal signal name, e.g. 'SIGHUP' or 'SIGKILL'
 * @param {string} metricName metric to increment once the signal was sent
 * @param {Object} wmetric metric sink exposing increment(name)
 * @returns {boolean} true when the signal was sent
 */
function sendSignal(pid, signal, metricName, wmetric) {
    try {
        process.kill(pid, signal);
        wmetric.increment(metricName);
        return true;
    } catch (e) {
        console.log('ERROR: Failed to send ' + signal + ' to ' + pid + ': ' + (e.message || e));
        return false;
    }
}

/**
 * Stores the new overall health flag on a watcher and notifies its listener.
 * @param {Watcher} instance watcher whose health changed
 * @param {*} isDown new value for overall_health_is_down
 */
function setOverallHealth(instance, isDown) {
    instance.overall_health_is_down = isDown;
    if (instance.healthChangeCallback) {
        instance.healthChangeCallback(instance);
    }
}

/**
 * Watcher class.
 *
 * Every option may be given directly on `opts` or inside `opts.config`;
 * `opts` wins.
 *
 * @param {Object} [opts]
 * @param {Object} [opts.config] fallback source for the options below
 * @param {number} [opts.max_inactive=60] seconds between inactivity checks
 * @param {number} [opts.monitor=30] seconds between status checks
 * @param {string} [opts.monPath] path of the datagram socket workers report to
 *        (defaults to monitr's ipcMonitorPath)
 * @param {string} [opts.socket_path="/tmp/watcher.sock"] path of the status socket
 * @param {number} [opts.timeout=30000] processing timeout once a worker serves
 *        requests; compared against status.elapsed
 * @param {number} [opts.timeout_start=30000] same, for workers that have not
 *        served a request yet
 * @param {Object} [opts.metric] metric sink with increment(name) and
 *        set(values, v); defaults to the no-op ./watcher_metric.js
 * @param {Function} [healthChangeCallback] called with the watcher once from
 *        the constructor and again whenever overall_health_is_down changes
 */
function Watcher(opts, healthChangeCallback) {
    opts = opts || {};

    var config = opts.config || {},
        deadProcTime = opts.max_inactive || config.max_inactive || 60,
        monitorTime = opts.monitor || config.monitor || 30,
        monPath = opts.monPath || config.monPath ||
            (monitor ? monitor.ipcMonitorPath : process._nm.ipcMonitorPath),
        statusPath = opts.socket_path || config.socket_path || "/tmp/watcher.sock",
        wmetric = opts.metric || require('./watcher_metric.js'),
        self = this,
        monitorSocket,
        server;

    // set timeouts
    self.maxTimeout = parseInt(opts.timeout || config.timeout, 10) || 30000;
    self.maxStartTimeout = parseInt(opts.timeout_start || config.timeout_start, 10) || 30000;
    self.healthChangeCallback = healthChangeCallback;
    self.maxOpenRequests = 0;
    if (self.healthChangeCallback) {
        // call it once up front so the callback can initialise the health flag
        self.healthChangeCallback(self);
    }
    if (self.overall_health_is_down === undefined) {
        self.overall_health_is_down = false;
    }

    // open up a unix socket to get process status messages
    monitorSocket = dgram.createSocket('unix_dgram');
    self._monitorSocket = monitorSocket;
    self._metric = wmetric;
    monitorSocket.on('message', function (msg, rinfo) {
        self.onmessage(msg, rinfo, wmetric);
    });

    // Remove a stale socket file (errors such as "does not exist" are ignored
    // on purpose), then bind.
    fs.unlink(monPath, function () {
        // The umask is cleared only around the synchronous directory/socket
        // creation and always restored, so it never leaks to unrelated code.
        var previousUmask = process.umask(0);
        try {
            try {
                mkdirp.sync(path.dirname(monPath), SOCKET_MODE);
            } catch (ex) {
                console.log("ERROR: Failed to create directory for socket " + ex.stack);
            }
            // start listening
            monitorSocket.bind(monPath);
        } finally {
            process.umask(previousUmask);
        }
        // give the bind a moment to complete before opening up permissions
        setTimeout(function () {
            try {
                fs.chmodSync(monPath, SOCKET_MODE);
            } catch (e) {
                console.log("ERROR: Could not change mod for Socket" + e.stack);
            }
        }, 500);
    });

    // Status server on a unix socket: every connection receives the current
    // statuses as JSON and is then closed.
    server = net.createServer(function (socket) {
        try {
            socket.end(JSON.stringify(statuses));
        } catch (e) {
            console.error("Failed to send response");
        }
    });

    fs.unlink(statusPath, function () {
        // Same scoped-umask approach as above; the chmod in the listening
        // callback settles the final permissions.
        var previousUmask = process.umask(0);
        try {
            server.listen(statusPath, function () {
                try {
                    fs.chmodSync(statusPath, SOCKET_MODE);
                } catch (e) {
                    console.log("ERROR: Could not change mod for Socket" + e.stack);
                }
            });
        } finally {
            process.umask(previousUmask);
        }
    });

    self._server = server;

    // setup intervals for checking status and inactivity
    self.setupIntervals(deadProcTime, monitorTime, wmetric);
}

/**
 * Stops the status server. Errors are logged and never thrown.
 */
Watcher.prototype.closeStatusService = function () {
    try {
        this._server.close();
    } catch (e) {
        console.log("Error happened while: " + e.stack + " while closing status server (can be ignored)");
    }
};

/**
 * Starts the two periodic checks. The timer handles are kept on
 * _inactivityInt and _monitorInt so callers can clear them.
 * @param {number} deadProcTime seconds between checkInactivity runs
 * @param {number} monitorTime seconds between checkStatus runs
 * @param {Object} wmetric metric sink handed to both checks
 */
Watcher.prototype.setupIntervals = function (deadProcTime, monitorTime, wmetric) {
    var self = this;

    // look for processes which have died or gone silent
    this._inactivityInt = setInterval(function () {
        self.checkInactivity(wmetric);
    }, deadProcTime * 1000);
    // look for processes which are stuck
    this._monitorInt = setInterval(function () {
        self.checkStatus(wmetric);
    }, monitorTime * 1000);
};

/**
 * Handles one status datagram: updates the statuses table, reports metrics
 * and recomputes the overall health. Payloads that are not JSON or carry no
 * status.pid are ignored.
 * @param {Buffer} msg raw datagram, expected to hold {"status": {...}} JSON
 * @param {Object} rinfo sender information (unused)
 * @param {Object} wmetric metric sink exposing set(values, v)
 */
Watcher.prototype.onmessage = function (msg, rinfo, wmetric) {
    var info,
        status,
        entry,
        changeInHealth = true,
        totalOpenRequests = 0,
        i;

    try {
        info = JSON.parse(msg.toString());
    } catch (e) {
        console.log('ERROR: Got JSON with broken payload');
        return;
    }
    if (!info || !info.status || !info.status.pid) {
        return;
    }
    status = info.status;

    // watcher-side receive time, in SECONDS (checkInactivity relies on this)
    status.wts = Date.now() / 1000;

    entry = statuses[status.pid];
    if (entry) {
        // Known process: health is unchanged only when both the previous and
        // the new report carry a health timestamp and agree on the flag.
        if (status.health_status_timestamp &&
                entry.curr.health_status_timestamp &&
                entry.curr.health_is_down === status.health_is_down) {
            changeInHealth = false;
        }

        entry.curr = status;
        // `last` keeps a running average of current and previous readings
        entry.last.cpu = (status.cpu + entry.last.cpu) / 2;
        entry.last.events = (status.events + entry.last.events) / 2;
        entry.debug = status.debug;
        entry.oreqs = status.oreqs || 0;
    } else {
        // First report from this PID: register it. The debug flag is honoured
        // right away so a worker being debugged is never signalled.
        entry = statuses[status.pid] = {
            last : status,
            kill : false,
            curr : status,
            debug : status.debug || 0,
            oreqs : status.oreqs || 0
        };
    }

    // once the process has served a request the regular timeout applies
    if (status.reqstotal > 0) {
        entry.listen = true;
    }

    // update the CPU per request metric
    if (status.cpuperreq !== undefined) {
        wmetric.set({"watcher.proc.cpureq" : status.cpuperreq,
            "watcher.proc.jiffyreq" : status.jiffyperreq});
    }

    if (status.health_status_timestamp && changeInHealth &&
            status.health_is_down !== this.overall_health_is_down) {
        if (status.health_is_down) {
            // one worker down is enough to mark the whole service down
            setOverallHealth(this, status.health_is_down);
        } else if (!this.isAnyWorkerDown()) {
            // this worker recovered and no other worker is down
            setOverallHealth(this, status.health_is_down);
        }
    }

    for (i in statuses) {
        if (statuses[i].oreqs) {
            totalOpenRequests += statuses[i].oreqs;
        }
    }
    wmetric.set({ "watcher.proc.openreqs" : totalOpenRequests }, 0);
    if (this.maxOpenRequests < totalOpenRequests) {
        wmetric.set({ "watcher.proc.maxopenreqs" : totalOpenRequests }, 0);
        this.maxOpenRequests = totalOpenRequests;
    }
};

/**
 * Looks for workers that appear stuck and escalates: SIGHUP on the first
 * detection, SIGKILL (and removal from the table) if the worker still looks
 * stuck on the next run. Workers flagged `debug` are never signalled.
 * @param {Object} wmetric metric sink exposing increment(name)
 */
Watcher.prototype.checkStatus = function (wmetric) {
    var pid,
        entry,
        maxTimeout;

    for (pid in statuses) {
        entry = statuses[pid];
        maxTimeout = entry.listen ? this.maxTimeout : this.maxStartTimeout;
        /*
         * Suspected endless loop or extremely slow event processing: the
         * time between the artificially fed event and its execution exceeds
         * the timeout while hardly any events (<= 2) were processed.
         *
         * This can also be caused by a totally overloaded system where the
         * process is not scheduled for a long time.
         */
        if (!entry.debug && entry.curr.elapsed >= maxTimeout &&
                entry.curr.events <= 2.0) {
            if (entry.kill) {
                console.log('Sending SIGKILL due endless loop suspect ' + pid);
                sendSignal(pid, 'SIGKILL', "watcher.proc.killed", wmetric);
                delete statuses[pid];
                this.postDelete(pid);
            } else {
                console.log('Sending SIGHUP to ' + pid);
                sendSignal(pid, 'SIGHUP', "watcher.proc.graceful", wmetric);
                entry.kill = true;
            }
        } else {
            entry.kill = false;
        }
    }
};

/**
 * Verifies whether the process is still running (by probing
 * /proc/<pid>/stat); if not, reports watcher.proc.died and removes it from
 * the table.
 * @param {string|number} pid process id to probe
 * @param {Object} wmetric metric sink exposing increment(name)
 */
Watcher.prototype.removeIfNotRunning = function (pid, wmetric) {
    try {
        fs.statSync('/proc/' + pid + '/stat');
    } catch (e) {
        wmetric.increment("watcher.proc.died");
        delete statuses[pid];
        console.log("REMOVED FROM table " + pid);
        this.postDelete(pid);
    }
};

/**
 * Drops processes that are gone, then SIGKILLs those that have not reported
 * for the processing timeout plus a 30 second grace period (60 seconds with
 * the defaults).
 * @param {Object} wmetric metric sink exposing increment(name)
 */
Watcher.prototype.checkInactivity = function (wmetric) {
    var pid,
        entry,
        nowMs,
        timeoutMs;

    for (pid in statuses) {
        this.removeIfNotRunning(pid, wmetric);
    }

    // check those from which we didn't really get a signal
    nowMs = Date.now();
    for (pid in statuses) {
        entry = statuses[pid];
        if (!entry.curr) {
            // nothing to measure silence against
            continue;
        }
        // Timeouts are millisecond values while wts is stored in seconds, so
        // bring both sides to milliseconds before comparing.
        timeoutMs = (entry.listen ? this.maxTimeout : this.maxStartTimeout) + INACTIVITY_GRACE_MS;
        if (nowMs - entry.curr.wts * 1000 >= timeoutMs) {
            console.log('Sending SIGKILL due to ' + (timeoutMs / 1000) + ' sec inactivity to ' + pid);
            sendSignal(pid, 'SIGKILL', "watcher.proc.killed", wmetric);
            delete statuses[pid];
            this.postDelete(pid);
        }
    }
};

/**
 * Called after a process was removed from the table: if the overall health
 * was down only because of removed workers, flip it back to up and notify.
 * @param {string|number} pid id of the removed process (unused)
 */
Watcher.prototype.postDelete = function (pid) {
    // if health is up already, nothing to do
    if (this.overall_health_is_down && !this.isAnyWorkerDown()) {
        setOverallHealth(this, false);
    }
};

/**
 * @returns {boolean} true when at least one tracked worker reported a health
 *          timestamp together with health_is_down
 */
Watcher.prototype.isAnyWorkerDown = function () {
    var pid;
    for (pid in statuses) {
        if (statuses[pid].curr.health_status_timestamp && statuses[pid].curr.health_is_down) {
            return true;
        }
    }
    return false;
};

module.exports.Watcher = Watcher;
module.exports.statuses = statuses;

// Started directly (node lib/index.js): run a watcher with default settings.
if (require.main === module) {
    watcher = new Watcher({});
}
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013, Yahoo! Inc. All rights reserved. 3 | * Copyrights licensed under the New BSD License. 4 | * See the accompanying LICENSE file for terms. 5 | */ 6 | /* 7 | * Dummy (no-op) metric monitoring object: a placeholder sink whose methods accept calls and do nothing. 8 | */ 9 | 10 | var WMetric = { 11 | /** 12 | * Increments the metric identified by name; v is accepted but ignored by this stub. 13 | */ 14 | increment : function (name, v) { 15 | // Intentionally empty: add implementation as necessary 16 | }, 17 | /** 18 | * Set the metric or multiple metrics at the same time; this stub ignores both names and v. 19 | * */ 20 | set : function (names, v) { 21 | // Intentionally empty: add implementation as necessary 22 | } 23 | }; 24 | 25 | module.exports = WMetric; 26 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "process-watcher", 3 | "description": "Monitors nodejs processes for endless loops and other abnormal behaviours and kills it.", 4 | "author": "Abhinav Raj ", 5 | "version": "0.0.2", 6 | "dependencies": { 7 | "unix-dgram": ">=0.0.3", 8 | "monitr": "*", 9 | "mkdirp": ">=0.3.5" 10 | }, 11 | "devDependencies": { 12 | "ytestrunner": "*", 13 | "yuitest": "*", 14 | "jshint": "~0.9.0", 15 | "istanbul": "~0.1.27" 16 | }, 17 | "main": "./lib/index.js", 18 | "scripts": { 19 | "pretest": "jshint --config ./node_modules/yui-lint/jshint.json ./lib/ ./tests/", 20 | "test": "istanbul cover --print=both --yui ytestrunner -- --include ./tests/options.js --include ./tests/builder.js --include ./tests/parser.js --include ./tests/parser_coffee.js --include ./tests/test_watcher.js" 21 | }, 22 | "bugs": { "url" : "http://github.com/yahoo/process-watcher/issues" }, 23 | "licenses":[ 24 | { 25 | "type" : "BSD", 26 | "url" : "https://github.com/yahoo/process-watcher/blob/master/LICENSE" 27 | } 28 | ], 29 | "repository": { 30 | "type":"git", 31 | "url":"http://github.com/yahoo/process-watcher.git" 32 | }, 33 | "engines": { 34 | "node": 
">0.8.x" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/test_watcher.js: -------------------------------------------------------------------------------- 1 | var YUITest = require('yuitest').YUITest, 2 | Assert = YUITest.Assert, 3 | suite = new YUITest.TestSuite("Unit"), 4 | fs = require('fs'), 5 | watcher = require(".."), 6 | dgram = require('unix-dgram'), 7 | dgpath = '/tmp/watcher_test_dgram', 8 | pid = process.pid, 9 | statusPath = '/tmp/watcher_status_path_test'; 10 | 11 | YUITest.TestRunner.add(suite); 12 | 13 | function getMessage(cpu, ts, elapsed, oreqs, proc) { 14 | proc = proc || pid; // NOTE(review): proc is never read after this line; the payload below always uses pid 15 | return JSON.stringify({ 16 | status: 17 | { 18 | user_cpu: 0, 19 | pid: pid, 20 | mem: 0.66, 21 | cpu: cpu, 22 | elapsed: elapsed, 23 | events: 0, 24 | cluster: 32768, 25 | cpuperreq: 0.23, 26 | sys_cpu: 0, 27 | ts: ts, 28 | oreqs: oreqs || 0, 29 | title: '/branches/v0.4/build/default/node' 30 | } 31 | } 32 | ); 33 | } 34 | 35 | function getMessageWithHealth(custom_pid, cpu, ts, elapsed, proc, hts, isDown) { 36 | proc = proc || pid; // NOTE(review): proc is never read after this line; the payload below uses custom_pid 37 | return JSON.stringify({ 38 | status: 39 | { 40 | user_cpu: 0, 41 | pid: custom_pid, 42 | mem: 0.66, 43 | cpu: cpu, 44 | elapsed: elapsed, 45 | events: 0, 46 | cluster: 32768, 47 | cpuperreq: 0.23, 48 | sys_cpu: 0, 49 | ts: ts, 50 | title: '/branches/v0.4/build/default/node', 51 | health_status_timestamp : hts, 52 | health_is_down : isDown 53 | } 54 | }); 55 | } 56 | 57 | var healthChangeCounter = 0; 58 | function healthChange(obj) { 59 | Assert.isTrue(obj instanceof watcher.Watcher); 60 | ++healthChangeCounter; 61 | } 62 | var healthChangeCalled = 0; 63 | 64 | var lastKill = '', lastKillPID = 0; 65 | 66 | process.kill = function(pid, sig) { 67 | lastKill = sig; 68 | lastKillPID = pid; 69 | console.log("KILL called " + sig); 70 | }; 71 | 72 | fs.unlink(dgpath); 73 | fs.unlink(statusPath); 74 | 75 | var testee; 76 | 77 | suite.add(new YUITest.TestCase({ 78 | name : "Watcher 
Test", 79 | 'Test Watcher with no health change callback' : function() { 80 | var test_watcher = new watcher.Watcher({ config : { max_inactive : 0.001, monitor : 0.001, 81 | monPath: dgpath, statusPath: statusPath }}); 82 | Assert.isNotNull(test_watcher); 83 | Assert.areEqual(healthChangeCalled, healthChangeCounter); //0 84 | }, 85 | 'Test Watcher with health change callback' : function() { 86 | testee = new watcher.Watcher({ max_inactive : 0.001, monitor : 0.001, 87 | monPath: dgpath, statusPath: statusPath }, healthChange); 88 | Assert.isNotNull(testee); 89 | Assert.areEqual(++healthChangeCalled, healthChangeCounter); //1: the Watcher constructor invokes the callback once 90 | }, 91 | 'Test Timer' : function() { 92 | this.wait(function() { 93 | clearInterval(testee._inactivityInt); 94 | clearInterval(testee._monitorInt); 95 | Assert.isTrue(true); 96 | }, 100); 97 | }, 98 | 99 | 'Verify status OK is returned' : function() { 100 | var message = new Buffer(getMessage(0, Date.now(), 0, 10)), 101 | self = this; 102 | 103 | self.wait(function() { 104 | var client = dgram.createSocket("unix_dgram"); 105 | client.send(message, 0, message.length, dgpath, function (err, bytes) { 106 | if (err) { 107 | console.log("Message Error " + (err.stack || err.toString())); 108 | Assert.isTrue(false); 109 | } else { 110 | self.wait(function(){ 111 | console.log(require('util').inspect(watcher.statuses, true, 10)); 112 | Assert.areEqual(watcher.statuses[pid].curr.cpu, 0); 113 | Assert.areEqual(watcher.statuses[pid].last.cpu, 0); 114 | Assert.areEqual(watcher.statuses[pid].oreqs, 10); 115 | 116 | // do subsequent message, which will update statistics 117 | message = new Buffer(getMessage(50, Date.now(), 0, 20)); 118 | client.send(message, 0, message.length, dgpath, function (err, bytes) { 119 | Assert.isTrue(!err); 120 | self.wait(function() { 121 | Assert.areEqual(watcher.statuses[pid].last.cpu, 25); 122 | Assert.areEqual(watcher.statuses[pid].curr.cpu, 50); 123 | Assert.areEqual(watcher.statuses[pid].oreqs, 20); 124 | 
//testee._monitorSocket.close(); 125 | client.close(); 126 | }, 100); 127 | }); 128 | }, 500); 129 | console.log("Wrote " + bytes + " bytes to socket."); 130 | } 131 | }); 132 | }, 300); 133 | }, 134 | 'Verify health callback is called' : function() { 135 | var ts = Date.now(), 136 | message, 137 | self = this, 138 | other_pid = 12345; 139 | Assert.areEqual(false, testee.overall_health_is_down, 'initially down should be false'); 140 | message = new Buffer(getMessageWithHealth(pid, 0, Date.now(), 0, null, ts, true)), 141 | self.wait(function() { 142 | var client = dgram.createSocket("unix_dgram"); 143 | client.send(message, 0, message.length, dgpath, function (err, bytes) { 144 | if (err) { 145 | console.log("Message Error " + (err.stack || err.toString())); 146 | Assert.isTrue(false); 147 | } else { 148 | self.wait(function(){ 149 | Assert.areEqual(true, testee.overall_health_is_down, 'health should be down now'); 150 | Assert.areEqual(++healthChangeCalled, healthChangeCounter, 'should be 2 now'); 151 | // other worker sends down 152 | message = new Buffer(getMessageWithHealth(other_pid, 50, Date.now(), 0, other_pid, ts + 1000, true)); 153 | client.send(message, 0, message.length, dgpath, function (err, bytes) { 154 | Assert.isTrue(!err); 155 | self.wait(function() { 156 | Assert.areEqual(true, testee.overall_health_is_down, 'health should still be down'); 157 | Assert.areEqual(healthChangeCalled, healthChangeCounter, 'counter should be still 2'); 158 | 159 | // subsequent message: other_pid now reports up 160 | message = new Buffer(getMessageWithHealth(other_pid, 50, Date.now(), 0, null, ts + 2000, false)); 161 | client.send(message, 0, message.length, dgpath, function (err, bytes) { 162 | Assert.isTrue(!err); 163 | self.wait(function() { 164 | Assert.areEqual(true, testee.overall_health_is_down, 'health should still be down as one worker is down'); 165 | Assert.areEqual(healthChangeCalled, healthChangeCounter, 'no change in health'); 166 | message = new 
Buffer(getMessageWithHealth(pid, 50, Date.now(), 0, other_pid, ts + 1200, false)); 167 | client.send(message, 0, message.length, dgpath, function (err, bytes) { 168 | Assert.isTrue(!err); 169 | self.wait(function() { 170 | Assert.areEqual(false, testee.overall_health_is_down, 'both worker is up'); 171 | Assert.areEqual(++healthChangeCalled, healthChangeCounter, 'should be 3 now'); //callback fired once more because overall health flipped back to up 172 | delete watcher.statuses[other_pid]; //remove the entry for other_pid 173 | testee._monitorSocket.close(); 174 | client.close(); 175 | }, 100); 176 | }); 177 | }, 100); 178 | }); 179 | }, 100); 180 | }); 181 | }, 500); 182 | console.log("Wrote " + bytes + " bytes to socket."); 183 | } 184 | }); 185 | }, 300); 186 | }, 187 | 'Test checkStatus' :function() { 188 | var sb = { 189 | increment : function(x) { 190 | this[x] = true; 191 | } 192 | }; 193 | 194 | // This should trigger the SIGHUP 195 | watcher.statuses.aaa = { 196 | curr : { 197 | elapsed : 70000, 198 | events: 1 199 | } 200 | }; 201 | 202 | testee.checkStatus(sb); 203 | Assert.isTrue(sb["watcher.proc.graceful"]); 204 | 205 | testee.checkStatus(sb); 206 | Assert.isTrue(sb["watcher.proc.killed"]); 207 | }, 208 | 209 | 'Test inactivity' :function(f) { 210 | watcher.statuses.xyz = {}; 211 | testee.checkInactivity(testee._metric); 212 | Assert.areEqual(watcher.statuses.xyz, undefined); 213 | var temp = watcher.statuses[pid]; 214 | 215 | watcher.statuses[pid] = { 216 | curr : { 217 | wts : 1000 218 | } 219 | }; 220 | 221 | lastKill = ''; 222 | lastKillPID = 0; 223 | testee.checkInactivity(testee._metric); 224 | 225 | // Verify proc is killed 226 | Assert.areEqual(lastKill, 'SIGKILL'); 227 | Assert.areEqual(lastKillPID, pid); 228 | watcher.statuses[pid] = temp; //restore pid status 229 | }, 230 | 'Test inactivity and health check reset' :function(f) { 231 | var ts = Date.now(); 232 | watcher.statuses['12345'] = {}; 233 | testee.overall_health_is_down = true; 234 | 
watcher.statuses[pid].curr.health_is_down = true; 235 | 236 | testee.checkInactivity(testee._metric); 237 | Assert.areEqual(watcher.statuses['12345'], undefined); 238 | Assert.areEqual(true, testee.overall_health_is_down); 239 | 240 | 241 | watcher.statuses[pid] = { 242 | curr : { 243 | wts : 1000 244 | } 245 | }; 246 | 247 | lastKill = ''; 248 | lastKillPID = 0; 249 | testee.checkInactivity(testee._metric); 250 | 251 | // Verify proc is killed 252 | Assert.areEqual(lastKill, 'SIGKILL'); 253 | Assert.areEqual(lastKillPID, pid); 254 | Assert.areEqual(false, testee.overall_health_is_down); 255 | }, 256 | 257 | 'Clean up - should run as the last' : function() { 258 | var ex = null; 259 | try { 260 | testee.closeStatusService(); 261 | } catch (e) { 262 | ex = e; 263 | } 264 | Assert.areEqual(ex, null); 265 | } 266 | })); 267 | --------------------------------------------------------------------------------