├── .gitignore
├── lockrun.c
└── README.markdown


/.gitignore:
--------------------------------------------------------------------------------
1 | lockrun
2 | 


--------------------------------------------------------------------------------
/lockrun.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * See README.markdown for build, install, and usage instructions.
  3 |  */
  4 | 
#include <stdio.h>
#include <stdarg.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/file.h>
 16 | 
 17 | #ifndef __GNUC__
 18 | # define __attribute__(x)	/* nothing */
 19 | #endif
 20 | 
 21 | 
 22 | #define STRMATCH(a,b)		(strcmp((a),(b)) == 0)
 23 | 
 24 | #define UNUSED_PARAMETER(v)	((void)(v))
 25 | 
 26 | #define	TRUE	1
 27 | #define	FALSE	0
 28 | 
 29 | static const char	*lockfile = 0;
 30 | static int		wait_for_lock = FALSE;
 31 | static mode_t		openmode = 0666;
 32 | static int		sleeptime = 10;		/* seconds */
 33 | static int		Verbose = FALSE;
 34 | static int		Maxtime  = 0;
 35 | static int		idempotent = FALSE;
 36 | 
 37 | static char *getarg(char *opt, char ***pargv);
 38 | 
 39 | static void die(const char *format, ...)
 40 | 		__attribute__((noreturn))
 41 | 		__attribute__((format(printf, 1, 2)));
 42 | 
 43 | #ifdef __sun
 44 | # define WAIT_AND_LOCK(fd) lockf(fd, F_TLOCK,0)
 45 | #else
 46 | # define WAIT_AND_LOCK(fd) flock(fd, LOCK_EX | LOCK_NB)
 47 | #endif
 48 | 
 49 | int main(int argc, char **argv)
 50 | {
 51 | 	char	*Argv0 = *argv;
 52 | 	int	rc;
 53 | 	int	lfd;
 54 | 	pid_t	childpid;
 55 | 	time_t	starttime;
 56 | 
 57 | 	UNUSED_PARAMETER(argc);
 58 | 
 59 | 	time(&starttime);
 60 | 
 61 | 	for ( argv++ ; *argv ; argv++ )
 62 | 	{
 63 | 		char    *arg = *argv;
 64 | 		char	*opt = strchr(arg, '=');
 65 | 
 66 | 		/* the -- token marks the end of the list */
 67 | 
 68 | 		if ( strcmp(*argv, "--") == 0 )
 69 | 		{
 70 | 			argv++;
 71 | 			break;
 72 | 		}
 73 | 
 74 | 		if (opt) *opt++ = '\0'; /* pick off the =VALUE part */
 75 | 
 76 | 		if ( STRMATCH(arg, "-L") || STRMATCH(arg, "--lockfile"))
 77 | 		{
 78 | 			lockfile = getarg(opt, &argv);
 79 | 		}
 80 | 
 81 | 		else if ( STRMATCH(arg, "-W") || STRMATCH(arg, "--wait"))
 82 | 		{
 83 | 			wait_for_lock = TRUE;
 84 | 		}
 85 | 
 86 | 		else if ( STRMATCH(arg, "-S") || STRMATCH(arg, "--sleep"))
 87 | 		{
 88 | 			sleeptime = atoi(getarg(opt, &argv));
 89 | 		}
 90 | 
 91 | 		else if ( STRMATCH(arg, "-T") || STRMATCH(arg, "--maxtime"))
 92 | 		{
 93 | 			Maxtime = atoi(getarg(opt, &argv));
 94 | 		}
 95 | 
 96 | 		else if ( STRMATCH(arg, "-V") || STRMATCH(arg, "--verbose"))
 97 | 		{
 98 | 			Verbose++;
 99 | 		}
100 | 		
101 | 		else if ( STRMATCH(arg, "-I") || STRMATCH(arg, "--idempotent"))
102 | 		{
103 | 			idempotent = TRUE;
104 | 		}
105 | 
106 | 		else
107 | 		{
108 | 			die("ERROR: \"%s\" is an invalid cmdline param", arg);
109 | 		}
110 | 	}
111 | 
112 | 	/*----------------------------------------------------------------
113 | 	 * SANITY CHECKING
114 | 	 *
115 | 	 * Make sure that we have all the parameters we require
116 | 	 */
117 | 	if (*argv == 0)
118 | 		die("ERROR: missing command to %s (must follow \"--\" marker) ", Argv0);
119 | 
120 | 	if (lockfile == 0)
121 | 		die("ERROR: missing --lockfile=F parameter");
122 | 
123 | 	/*----------------------------------------------------------------
124 | 	 * Open or create the lockfile, then try to acquire the lock. If
125 | 	 * the lock is acquired immediately (==0), then we're done, but
126 | 	 * if the lock is not available, we have to wait for it.
127 | 	 *
128 | 	 * We can either loop trying for the lock (for --wait), or exit
129 | 	 * with error.
130 | 	 */
131 | 
132 | 	if ( (lfd = open(lockfile, O_RDWR|O_CREAT, openmode)) < 0)
133 | 		die("ERROR: cannot open(%s) [err=%s]", lockfile, strerror(errno));
134 | 
135 | 	while ( WAIT_AND_LOCK(lfd) != 0 )
136 | 	{
137 | 		if ( ! wait_for_lock )
138 | 		{
139 | 			
140 | 			if(idempotent) /* given the idempotent flag, we treat contention as a no-op */
141 | 			{
142 | 				exit(EXIT_SUCCESS);
143 | 			}
144 | 			else
145 | 			{
146 | 				die("ERROR: cannot launch %s - run is locked", argv[0]);
147 | 			}
148 | 		}
149 | 
150 | 		/* waiting */
151 | 		if ( Verbose ) printf("(locked: sleeping %d secs)\n", sleeptime);
152 | 
153 | 		sleep(sleeptime);
154 | 	}
155 | 
156 | 	fflush(stdout);
157 | 
158 | 	/* run the child */
159 | 
160 | 
161 | 	if ( (childpid = fork()) == 0 )
162 | 	{
163 | 		close(lfd);		// don't need the lock file
164 | 
165 | 		/* Set rc to the result of execvp. This lets the parent know we failed. */
166 | 		rc = execvp(argv[0], argv);
167 | 	}
168 | 	else if ( childpid > 0 )
169 | 	{
170 | 		time_t endtime;
171 | 		pid_t  pid;
172 | 		int    status;
173 | 
174 | 		if ( Verbose )
175 | 		    printf("Waiting for process %ld\n", (long) childpid);
176 | 
177 | 		pid = waitpid(childpid, &status, 0);
178 | 		rc = WEXITSTATUS(status);
179 | 
180 | 		time(&endtime);
181 | 
182 | 		endtime -= starttime;
183 | 
184 | 		if ( Verbose || (Maxtime > 0  &&  endtime > Maxtime) )
185 | 		    printf("pid %d exited with status %d, exit code: %d (time=%ld sec)\n",
186 | 			   pid, status, rc, endtime);
187 | 	}
188 | 	else
189 | 	{
190 | 		die("ERROR: cannot fork [%s]", strerror(errno));
191 | 	}
192 | 
193 | 	exit(rc);
194 | }
195 | 
196 | 
/*! \fn static char *getarg(char *opt, char ***pargv)
 *  \brief Fetch the value that belongs to a command-line option
 *
 *	Options may be written as "-X=FOO" or as "-X FOO". The caller has
 *	already split off any "=FOO" part and passes it in as \a opt:
 *
 *	 - \a opt non-null: the value came with the option, hand it back.
 *	 - \a opt null: the value is the next argv[] entry, so the caller's
 *	   argv cursor (\a *pargv) is moved forward onto it.
 *
 *	Running off the end of argv[] while looking for the value is a
 *	fatal error, reported through die() with the option's name as it
 *	currently appears in argv[].
 *
 * \param opt    the "=VALUE" text, or null when there was none
 * \param pargv  address of the caller's argv cursor; may be advanced
 *
 * \return character pointer to the option's value
 */
static char *getarg(char *opt, char ***pargv)
{
	char		**cursor = *pargv;
	const char	*name    = cursor[0];	/* remembered for the error text */
	char		*value;

	/* value was attached with '=': nothing to consume */
	if (opt != NULL)
		return opt;

	/* otherwise step the caller's cursor onto the following word */
	cursor++;
	*pargv = cursor;
	value  = *cursor;

	if (value == NULL)
		die("ERROR: option %s requires a parameter", name);

	return value;
}
228 | 
/*
 * die()
 *
 *	Fatal-error exit. Takes a printf-style format plus arguments,
 *	writes the formatted message and a trailing newline to the
 *	standard error, and terminates with EXIT_FAILURE. Never returns.
 */

static void die(const char *format, ...)
{
	va_list	ap;

	va_start(ap, format);
	vfprintf(stderr, format, ap);
	va_end(ap);

	fputc('\n', stderr);

	exit(EXIT_FAILURE);
}
247 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
  1 | lockrun - Run cron job with overrun protection
  2 | ==============================================
  3 | 
  4 | When doing network monitoring, it's common to run a cron job every five
  5 | minutes (the standard interval) to roam around the network gathering
  6 | data. Smaller installations may have no trouble running within this
  7 | limit, but in larger networks or those where devices are often
  8 | unreachable, running past the five-minute mark could happen frequently.
  9 | 
 10 | The effect of running over depends on the nature of the monitoring
 11 | application: it could be of no consequence or it could be catastrophic.
 12 | What's in common is that running two jobs at once (the old one which ran
 13 | over, plus the new one) slows down the new one, increasing the risk that
 14 | *it* will run long as well.
 15 | 
 16 | This is commonly a cascading failure which can take many polling
 17 | sessions to right itself, which may include lost data in the interim.
 18 | 
 19 | Our response has been to create this tool, `lockrun`, which serves as a
 20 | protective wrapper. Before launching the given command, it ensures that
 21 | another instance of the same command is not already running.
 22 | 
 23 | Build and Install
 24 | =================
 25 | 
 26 | This tool is published in the form of portable C source code, and it can
 27 | be compiled on any Linux/UNIX platform that provides a development
 28 | environment.
 29 | 
 30 | To build and install it from the command line:
 31 | 
 32 |     $ gcc lockrun.c -o lockrun
 33 |     $ sudo cp lockrun /usr/local/bin/
 34 | 
 35 | Now we'll find `lockrun` in the usual place: `/usr/local/bin/`.
 36 | 
 37 | We'll note that though portable, this program is nevertheless designed
 38 | only to run on UNIX or Linux systems: it certainly won't build and run
 39 | properly on a Windows computer.
 40 | 
 41 | Furthermore, file locking has always been one of the more problematic
 42 | areas of portability, there being several mechanisms in place. `lockrun`
 43 | uses the `flock()` system call (`lockf()` on Solaris), and this of course
 44 | requires low-level OS support.
 45 | 
 46 | We've tested this in FreeBSD and Linux, but other operating systems
 47 | might trip over compilation issues. We welcome portability reports (for
 48 | good or bad).
 49 | 
 50 | We've also received a report that this works on Apple's OS X.
 51 | 
 52 | Example Usage
 53 | =============
 54 | 
 55 | Once `lockrun` has been built and installed, it's time to put it
 56 | to work. This is virtually always used in a crontab entry, and the
 57 | command line should include the name of the lockfile to use as well
 58 | as the command to run.
 59 | 
 60 | This entry in a crontab file runs the Cacti poller script every five
 61 | minutes, protected by a lockfile:
 62 | 
 63 |     */5 * * * * /usr/local/bin/lockrun --lockfile=/tmp/cacti.lockrun -- /usr/local/bin/cron-cacti-poller
 64 | 
 65 | The file used, `/tmp/cacti.lockrun`, is created (if necessary), the lock
 66 | acquired, and closed when finished. At no time does `lockrun` perform
 67 | any file I/O: the file exists only to be the subject of locking
 68 | requests.
 69 | 
 70 | Note that everything up to the standalone `--` is considered an option
 71 | to `lockrun`, but everything after is the literal command to run.
 72 | 
 73 | The example provided here is a run-or-nothing instance: if the lock
 74 | cannot be acquired, the program exits with a failure message to the
 75 | standard error stream, which hopefully is routed back to the user via an
 76 | email notification:
 77 | 
 78 | `ERROR: cannot launch [command line] - run is locked`
 79 | 
 80 | This mechanism effectively *skips a polling run*, but this may be the
 81 | only option when polling runs long periodically. If one polling run goes
 82 | quite long, it's conceivable that multiple subsequent jobs could be
 83 | stacked behind the slow one, and never get caught up.
 84 | 
 85 | But if most jobs complete very rapidly, adding the `--wait`
 86 | parameter might allow the system to catch up after a lone straggler
 87 | runs long.
 88 | 
 89 | 
 90 | However one organizes this, one can't avoid being concerned with runs
 91 | which are locked often. An inability to complete a polling run on time
 92 | indicates a resource-allocation problem which is not actually fixed by
 93 | skipping some data.
 94 | 
 95 | If this happens regularly, it's important to track down what's causing
 96 | the overruns: lack of memory? inadequate CPU? serialized jobs which
 97 | could benefit from parallelization or asynchronous processing?
 98 | 
 99 | There is no substitute for actual human observation of important
100 | systems, and though `lockrun` may forestall a monitoring meltdown, it
101 | doesn't replace paying attention. It is **not** an advanced command
102 | queuing system.
103 | 
104 | Locking Behavior
105 | ================
106 | 
107 | We've been asked why we do this in a C program and not a simple shell
108 | script: the answer is that we require bulletproof, no-maintenance
109 | protection, and that's very hard to do with shell scripting.
110 | 
111 | With touch-a-file locking, there's a chance that the lockfile can be
112 | left around after everything is done: what if the cron job has run long,
113 | and the administrator killed everything associated with the job? What
114 | about a system crash leaving the lockfile around? What if there's a
115 | fatal error in `lockrun` itself? All of these leave the lockfile around
116 | in the system for the next run to trip over.
117 | 
118 | One could make this mechanism smarter by including the `PID` of the
119 | locking process inside the file, and then using `kill(*pid*,0)` to see
120 | if that process exists, but PIDs are reused, and it's possible to have a
121 | false positive (i.e., when the previous `lockrun` has finished, but some
122 | *other* process has taken that PID slot). We've always disliked the
123 | nondeterminism of this mechanism.
124 | 
125 | So we required a mechanism which provided guaranteed, bulletproof
126 | cleanup at program exit, and no chance of false positives. Though one
127 | can find numerous mechanisms for this, use of file locks is the easiest
128 | to code and understand. Setting a lock automatically tests for the
129 | previous lock, and this means no race conditions to worry about. When
130 | the file is closed, locks evaporate.
131 | 
132 | Note that file locking under UNIX is typically *advisory* only: a lock
133 | placed by one process is only honored by other processes that choose to
134 | check the lock first. Any process with suitable permissions is free to
135 | read or write anything without regard to locks.
136 | 
137 | Advisory locking works on the honor system, but it's entirely
138 | appropriate for our use here.
139 | 
140 | Finally, we'll note that our locking mechanism is only designed to
141 | prevent two lock-protected processes from running at once; it is *not* a
142 | queuing system.
143 | 
144 | When using the `--wait` parameter, it's entirely possible to have many
145 | processes stacked up in line behind a prior long-running process. When
146 | the long-running process exits, it's impossible to predict which of the
147 | waiting processes will run next, and it's probably not going to be done
148 | in the order in which they were launched. Users with more sophisticated
149 | queuing requirements probably need to find a different mechanism.
150 | 
151 | Command-Line Options
152 | ====================
153 | 
154 | `lockrun` supports GNU-style command-line options, and this includes
155 | using `--` to mark their end:
156 | 
157 |     $ lockrun [options] -- [command]
158 | 
159 | The actual command after `--` can have any arguments it likes, and they
160 | are entirely uninterpreted by `lockrun`.
161 | 
162 | We'll note that command-line redirection (`> /dev/null`, etc.) is not
163 | supported by this or the command which follows -- it's handled **by the
164 | calling shell**. This is the case whether it's run from cron or not.
165 | 
166 |  * `--idempotent`
167 | 
168 |   > Allows silent successful exit when lock contention is encountered.
169 | 
170 |  * `--lockfile=[filename]`
171 | 
172 |   > Specify the name of a file which is used for locking. This filename
173 |   > is created if necessary (with mode 0666), and no I/O of any kind is
174 |   > done. This file is never removed.
175 | 
176 |  * `--maxtime=[N]`
177 | 
178 |   > The script being controlled ought to run for no more than *N*
179 |   > seconds, and if it's beyond that time, we report it on the
180 |   > standard output stream (which probably gets routed to the user via
181 |   > cron's email).
182 | 
183 | 
184 |  * `--wait`
185 | 
186 |   > Normally this program exits with error when a pre-existing lock is found;
187 |   > `--wait` makes it loop instead, retrying every `--sleep=[N]` seconds
188 |   > (default 10) until the prior lock is released.
189 | 
190 |  * `--verbose`
191 | 
192 |   > Show a bit more runtime debugging.
193 | 
194 |   * `--`
195 | 
196 |   > Mark the end of the options, the actual command to run follows.
197 | 
198 | History
199 | =======
200 | 
201 |  * 2013/08/02 - return execvp's value if running child process fails (Allard Hoeve)
202 |  * 2010/10/04 - added idempotency to allow run lock contention to be treated as a no-op (Mike Cerna, Groupon)
203 |  * 2009/06/25 — added lockf() support for Solaris 10 (thanks to Michal Bella)
204 |  * 2009/03/09 — Tracked on GitHub by Peter Harkins.
205 |  * 2006/06/03 — initial release by Stephen J. Friedl. <http://unixwiz.net/archives/2006/06/new_tool_lockru.html>
206 | 
207 | License
208 | =======
209 | 
210 | This software is public domain.
211 | 


--------------------------------------------------------------------------------