├── .gitignore ├── lockrun.c └── README.markdown /.gitignore: -------------------------------------------------------------------------------- 1 | lockrun 2 | -------------------------------------------------------------------------------- /lockrun.c: -------------------------------------------------------------------------------- 1 | /* 2 | * See README.markdown for build, install, and usage instructions. 3 | */ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | #ifndef __GNUC__ 18 | # define __attribute__(x) /* nothing */ 19 | #endif 20 | 21 | 22 | #define STRMATCH(a,b) (strcmp((a),(b)) == 0) 23 | 24 | #define UNUSED_PARAMETER(v) ((void)(v)) 25 | 26 | #define TRUE 1 27 | #define FALSE 0 28 | 29 | static const char *lockfile = 0; 30 | static int wait_for_lock = FALSE; 31 | static mode_t openmode = 0666; 32 | static int sleeptime = 10; /* seconds */ 33 | static int Verbose = FALSE; 34 | static int Maxtime = 0; 35 | static int idempotent = FALSE; 36 | 37 | static char *getarg(char *opt, char ***pargv); 38 | 39 | static void die(const char *format, ...) 
40 | __attribute__((noreturn)) 41 | __attribute__((format(printf, 1, 2))); 42 | 43 | #ifdef __sun 44 | # define WAIT_AND_LOCK(fd) lockf(fd, F_TLOCK,0) 45 | #else 46 | # define WAIT_AND_LOCK(fd) flock(fd, LOCK_EX | LOCK_NB) 47 | #endif 48 | 49 | int main(int argc, char **argv) 50 | { 51 | char *Argv0 = *argv; 52 | int rc; 53 | int lfd; 54 | pid_t childpid; 55 | time_t starttime; 56 | 57 | UNUSED_PARAMETER(argc); 58 | 59 | time(&starttime); 60 | 61 | for ( argv++ ; *argv ; argv++ ) 62 | { 63 | char *arg = *argv; 64 | char *opt = strchr(arg, '='); 65 | 66 | /* the -- token marks the end of the list */ 67 | 68 | if ( strcmp(*argv, "--") == 0 ) 69 | { 70 | argv++; 71 | break; 72 | } 73 | 74 | if (opt) *opt++ = '\0'; /* pick off the =VALUE part */ 75 | 76 | if ( STRMATCH(arg, "-L") || STRMATCH(arg, "--lockfile")) 77 | { 78 | lockfile = getarg(opt, &argv); 79 | } 80 | 81 | else if ( STRMATCH(arg, "-W") || STRMATCH(arg, "--wait")) 82 | { 83 | wait_for_lock = TRUE; 84 | } 85 | 86 | else if ( STRMATCH(arg, "-S") || STRMATCH(arg, "--sleep")) 87 | { 88 | sleeptime = atoi(getarg(opt, &argv)); 89 | } 90 | 91 | else if ( STRMATCH(arg, "-T") || STRMATCH(arg, "--maxtime")) 92 | { 93 | Maxtime = atoi(getarg(opt, &argv)); 94 | } 95 | 96 | else if ( STRMATCH(arg, "-V") || STRMATCH(arg, "--verbose")) 97 | { 98 | Verbose++; 99 | } 100 | 101 | else if ( STRMATCH(arg, "-I") || STRMATCH(arg, "--idempotent")) 102 | { 103 | idempotent = TRUE; 104 | } 105 | 106 | else 107 | { 108 | die("ERROR: \"%s\" is an invalid cmdline param", arg); 109 | } 110 | } 111 | 112 | /*---------------------------------------------------------------- 113 | * SANITY CHECKING 114 | * 115 | * Make sure that we have all the parameters we require 116 | */ 117 | if (*argv == 0) 118 | die("ERROR: missing command to %s (must follow \"--\" marker) ", Argv0); 119 | 120 | if (lockfile == 0) 121 | die("ERROR: missing --lockfile=F parameter"); 122 | 123 | /*---------------------------------------------------------------- 124 | 
* Open or create the lockfile, then try to acquire the lock. If 125 | * the lock is acquired immediately (==0), then we're done, but 126 | * if the lock is not available, we have to wait for it. 127 | * 128 | * We can either loop trying for the lock (for --wait), or exit 129 | * with error. 130 | */ 131 | 132 | if ( (lfd = open(lockfile, O_RDWR|O_CREAT, openmode)) < 0) 133 | die("ERROR: cannot open(%s) [err=%s]", lockfile, strerror(errno)); 134 | 135 | while ( WAIT_AND_LOCK(lfd) != 0 ) 136 | { 137 | if ( ! wait_for_lock ) 138 | { 139 | 140 | if(idempotent) /* given the idempotent flag, we treat contention as a no-op */ 141 | { 142 | exit(EXIT_SUCCESS); 143 | } 144 | else 145 | { 146 | die("ERROR: cannot launch %s - run is locked", argv[0]); 147 | } 148 | } 149 | 150 | /* waiting */ 151 | if ( Verbose ) printf("(locked: sleeping %d secs)\n", sleeptime); 152 | 153 | sleep(sleeptime); 154 | } 155 | 156 | fflush(stdout); 157 | 158 | /* run the child */ 159 | 160 | 161 | if ( (childpid = fork()) == 0 ) 162 | { 163 | close(lfd); // don't need the lock file 164 | 165 | /* Set rc to the result of execvp. This lets the parent know we failed. */ 166 | rc = execvp(argv[0], argv); 167 | } 168 | else if ( childpid > 0 ) 169 | { 170 | time_t endtime; 171 | pid_t pid; 172 | int status; 173 | 174 | if ( Verbose ) 175 | printf("Waiting for process %ld\n", (long) childpid); 176 | 177 | pid = waitpid(childpid, &status, 0); 178 | rc = WEXITSTATUS(status); 179 | 180 | time(&endtime); 181 | 182 | endtime -= starttime; 183 | 184 | if ( Verbose || (Maxtime > 0 && endtime > Maxtime) ) 185 | printf("pid %d exited with status %d, exit code: %d (time=%ld sec)\n", 186 | pid, status, rc, endtime); 187 | } 188 | else 189 | { 190 | die("ERROR: cannot fork [%s]", strerror(errno)); 191 | } 192 | 193 | exit(rc); 194 | } 195 | 196 | 197 | /*! 
\fn static char *getarg(char *opt, char ***pargv) 198 | * \brief A function to parse calling parameters 199 | * 200 | * This is a helper for the main arg-processing loop: we work with 201 | * options which are either of the form "-X=FOO" or "-X FOO"; we 202 | * want an easy way to handle either one. 203 | * 204 | * The idea is that if the parameter has an = sign, we use the rest 205 | * of that same argv[X] string, otherwise we have to get the *next* 206 | * argv[X] string. But it's an error if an option-requiring param 207 | * is at the end of the list with no argument to follow. 208 | * 209 | * The option name could be of the form "-C" or "--conf", but we 210 | * grab it from the existing argv[] so we can report it well. 211 | * 212 | * \return character pointer to the argument 213 | * 214 | */ 215 | static char *getarg(char *opt, char ***pargv) 216 | { 217 | const char *const optname = **pargv; 218 | 219 | /* option already set? */ 220 | if (opt) return opt; 221 | 222 | /* advance to next argv[] and try that one */ 223 | if ((opt = *++(*pargv)) == 0) 224 | die("ERROR: option %s requires a parameter", optname); 225 | 226 | return opt; 227 | } 228 | 229 | /* 230 | * die() 231 | * 232 | * Given a printf-style argument list, format it to the standard error, 233 | * append a newline, then exit with error status. 234 | */ 235 | 236 | static void die(const char *format, ...) 
237 | { 238 | va_list args; 239 | 240 | va_start(args, format); 241 | vfprintf(stderr, format, args); 242 | putc('\n', stderr); 243 | va_end(args); 244 | 245 | exit(EXIT_FAILURE); 246 | } 247 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | lockrun - Run cron job with overrun protection 2 | ============================================== 3 | 4 | When doing network monitoring, it's common to run a cron job every five 5 | minutes (the standard interval) to roam around the network gathering 6 | data. Smaller installations may have no trouble running within this 7 | limit, but in larger networks or those where devices are often 8 | unreachable, running past the five-minute mark could happen frequently. 9 | 10 | The effect of running over depends on the nature of the monitoring 11 | application: it could be of no consequence or it could be catastrophic. 12 | What's in common is that running two jobs at once (the old one which ran 13 | over, plus the new one) slows down the new one, increasing the risk that 14 | *it* will run long as well. 15 | 16 | This is commonly a cascading failure which can take many polling 17 | sessions to right itself, which may include lost data in the interim. 18 | 19 | Our response has been to create this tool, `lockrun`, which serves as a 20 | protective wrapper. Before launching the given command, it insures that 21 | another instance of the same command is not already running. 22 | 23 | Build and Install 24 | ================= 25 | 26 | This tool is published in the form of portable C source code, and it can 27 | be compiled on any Linux/UNIX platform that provides a development 28 | environment. 29 | 30 | To build and install it from the command line: 31 | 32 | $ gcc lockrun.c -o lockrun 33 | $ sudo cp lockrun /usr/local/bin/ 34 | 35 | Now we'll find `lockrun` in the usual place: `/usr/local/bin/`. 
36 | 37 | We'll note that though portable, this program is nevertheless designed 38 | only to run on UNIX or Linux systems: it certainly won't build and run 39 | properly on a Windows computer. 40 | 41 | Furthermore, file locking has always been one of the more problematic 42 | areas of portability, there being several mechanisms in place. `lockrun` 43 | uses the `flock()` system call (`lockf()` on Solaris), and this of course requires low-level OS 44 | support. 45 | 46 | We've tested this in FreeBSD and Linux, but other operating systems 47 | might trip over compilation issues. We welcome portability reports (for 48 | good or bad). 49 | 50 | We've also received a report that this works on Apple's OS X. 51 | 52 | Example Usage 53 | ============= 54 | 55 | Once `lockrun` has been built and installed, it's time to put it 56 | to work. This is virtually always used in a crontab entry, and the 57 | command line should include the name of the lockfile to use as well 58 | as the command to run. 59 | 60 | This entry in a crontab file runs the Cacti poller script every five 61 | minutes, protected by a lockfile: 62 | 63 | */5 * * * * /usr/local/bin/lockrun --lockfile=/tmp/cacti.lockrun -- /usr/local/bin/cron-cacti-poller 64 | 65 | The file used, `/tmp/cacti.lockrun`, is created (if necessary), the lock 66 | acquired, and closed when finished. At no time does `lockrun` perform 67 | any file I/O: the file exists only to be the subject of locking 68 | requests. 69 | 70 | Note that everything up to the standalone `--` is considered an option 71 | to `lockrun`, but everything after is the literal command to run.
72 | 73 | The example provided here is a run-or-nothing instance: if the lock 74 | cannot be acquired, the program exits with a failure message to the 75 | standard error stream, which hopefully is routed back to the user via an 76 | email notification: 77 | 78 | `ERROR: cannot launch [command line] - run is locked` 79 | 80 | This mechanism effectively *skips a polling run*, but this may be the 81 | only option when polling runs long periodically. If one polling run goes 82 | quite long, it's conceivable that multiple subsequent jobs could be 83 | stacked behind the slow one, and never get caught up. 84 | 85 | But if most jobs complete very rapidly, adding the `--wait` 86 | parameter might allow the system to catch up after a lone straggler 87 | runs long. 88 | 89 | 90 | However one organizes this, one can't avoid being concerned with runs 91 | which are locked often. An inability to complete a polling run on time 92 | indicates a resource-allocation problem which is not actually fixed by 93 | skipping some data. 94 | 95 | If this happens regularly, it's important to track down what's causing 96 | the overruns: lack of memory? inadequate CPU? serialized jobs which 97 | could benefit from parallelization or asynchronous processing? 98 | 99 | There is no substitute for actual human observation of important 100 | systems, and though `lockrun` may forestall a monitoring meltdown, it 101 | doesn't replace paying attention. It is **not** an advanced command 102 | queuing system. 103 | 104 | Locking Behavior 105 | ================ 106 | 107 | We've been asked why we do this in a C program and not a simple shell 108 | script: the answer is that we require bulletproof, no-maintenance 109 | protection, and that's very hard to do with shell scripting.
110 | 111 | With touch-a-file locking, there's a chance that the lockfile can be 112 | left around after everything is done: what if the cron job has run long, 113 | and the administrator killed everything associated with the job? What 114 | about a system crash leaving the lockfile around? What if there's a 115 | fatal error in `lockrun` itself? All of these leave the lockfile around 116 | in the system for the next run to trip over. 117 | 118 | One could make this mechanism smarter by including the `PID` of the 119 | locking process inside the file, and then using `kill(*pid*,0)` to see 120 | if that process exists, but PIDs are reused, and it's possible to have a 121 | false positive (i.e., when the previous `lockrun` has finished, but some 122 | *other* process has taken that PID slot). We've always disliked the 123 | nondeterminism of this mechanism. 124 | 125 | So we required a mechanism which provided guaranteed, bulletproof 126 | cleanup at program exit, and no chance of false positives. Though one 127 | can find numerous mechanisms for this, use of file locks is the easiest 128 | to code and understand. Setting a lock automatically tests for the 129 | previous lock, and this means no race conditions to worry about. When 130 | the file is closed, locks evaporate. 131 | 132 | Note that file locking under UNIX is typically *advisory* only: A lock 133 | placed by one process is only honored by other processes that choose to 134 | check the lock first. Any process with suitable permissions is free to 135 | read or write anything without regard to locks. 136 | 137 | Advisory locks work on the honor system, but they're entirely 138 | appropriate for our use here. 139 | 140 | Finally, we'll note that our locking mechanism is only designed to 141 | prevent two lock-protected processes from running at once; it is *not* a 142 | queuing system.
143 | 144 | When using the `--wait` parameter, it's entirely possible to have many 145 | processes stacked up in line behind a prior long-running process. When 146 | the long-running process exits, it's impossible to predict which of the 147 | waiting processes will run next, and it's probably not going to be done 148 | in the order in which they were launched. Users with more sophisticated 149 | queuing requirements probably need to find a different mechanism. 150 | 151 | Command-Line Options 152 | ==================== 153 | 154 | `lockrun` supports GNU-style command-line options, and this includes 155 | using `--` to mark their end: 156 | 157 | $ lockrun [options] -- [command] 158 | 159 | The actual command after `--` can have any arguments it likes, and they 160 | are entirely uninterpreted by `lockrun`. 161 | 162 | We'll note that command-line redirection (`> /dev/null`, etc.) is not 163 | supported by this or the command which follows -- it's handled **by the 164 | calling shell**. This is the case whether it's run from cron or not. 165 | 166 | * `--idempotent` 167 | 168 | > Allows silent successful exit when lock contention is encountered. 169 | 170 | * `--lockfile=[filename]` 171 | 172 | > Specify the name of a file which is used for locking. This file 173 | > is created if necessary (with mode 0666), and no I/O of any kind is 174 | > done. This file is never removed. 175 | 176 | * `--maxtime=[N]` 177 | 178 | > The script being controlled ought to run for no more than *N* 179 | > seconds, and if it's beyond that time, we should report it to the 180 | > standard error stream (which probably gets routed to the user via 181 | > cron's email). 182 | 183 | 184 | * `--wait` 185 | 186 | > When a pre-existing lock is found, this program normally exits with 187 | > error, but adding the `--wait` parameter causes it to loop, waiting 188 | > for the prior lock to be released. 189 | 190 | * `--verbose` 191 | 192 | > Show a bit more runtime debugging.
193 | 194 | * `--` 195 | 196 | > Mark the end of the options, the actual command to run follows. 197 | 198 | History 199 | ======= 200 | 201 | * 2013/08/02 - return execvp's value if running child process fails (Allard Hoeve) 202 | * 2010/10/04 - added idempotency to allow run lock contention to be treated as a no-op (Mike Cerna, Groupon) 203 | * 2009/06/25 — added lockf() support for Solaris 10 (thanks to Michal Bella) 204 | * 2009/03/09 — Tracked on GitHub by Peter Harkins. 205 | * 2006/06/03 — initial release by Stephen J. Friedl. 206 | 207 | License 208 | ======= 209 | 210 | This software is public domain. 211 | --------------------------------------------------------------------------------