├── grun.prof.example ├── gwrap ├── init-d ├── Grun.pm ├── README ├── qsub ├── t └── grun.t ├── gwrap.c ├── xssh └── grun /grun.prof.example: -------------------------------------------------------------------------------- 1 | # GRUN AUTO-PROFILER 2 | # 3 | # README: 4 | # 5 | # This is a PERL script that operates on the $cmd variable to auto-set the $mem and $cpu variables 6 | # 7 | # The resulting $cpu and $mem variables are then used to increase the memory and cpu requests for 8 | # submitted jobs 9 | # 10 | # You can use any variable names you want... and simply return an array with 2 members ($cpu, $mem); 11 | # 12 | # This allows users to run jobs more naturally. 13 | # 14 | # EXAMPLE: 15 | # $cpu = 2 if $cmd =~ /\balchemy\b/ 16 | # $mem = 3000 if $cmd =~ /\tophat\b/ && -s $cmd[4] > 100000; 17 | -------------------------------------------------------------------------------- /gwrap: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 4 | # Wraps a program with a library that fetches missing files from the "master" server 5 | # Fetch program should cache not-found using some mechanism for a short period of time 6 | # Fetch program should write the file to disk with the same user & permissions as the original 7 | # 8 | 9 | # the defaults below work... but the fetch program doesn't compress data (yet) 10 | 11 | [ -z "$GRUN_CONF" ] && GRUN_CONF=/etc/grun.conf 12 | [ -z "$GFETCH_PROGRAM" ] && GFETCH_PROGRAM="grun -q -i -s file %s" 13 | 14 | master=`perl -ne 'print $1 if /^\s*master:\s*([^:]+)/i' $GRUN_CONF`; 15 | GFETCH_PATH=`perl -ne 'print $1 if /^\s*fetch_path:\s*(\S+)/i' $GRUN_CONF`; 16 | 17 | if [[ ! $HOSTNAME =~ "^$master" ]]; then 18 | export GFETCH_PROGRAM GFETCH_PATH 19 | export LD_PRELOAD=/var/lib/gwrap.so 20 | $* 21 | fi 22 | 23 | -------------------------------------------------------------------------------- /init-d: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | ### BEGIN INIT INFO 3 | # Provides: grun 4 | # Required-Start: $network $named $remote_fs $syslog 5 | # Required-Stop: $network $named $remote_fs $syslog 6 | # Default-Start: 2 3 4 5 7 | # Default-Stop: 0 1 6 8 | ### END INIT INFO 9 | ### CENTOS 10 | # chkconfig: - 99 40 11 | # description: grun startup script 12 | ### 13 | PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin 14 | DAEMON=/opt/bin/grun 15 | OPTS= 16 | NAME=grun 17 | DESC="grun Daemon" 18 | 19 | test -x $DAEMON || exit 0 20 | 21 | set -e 22 | 23 | case "$1" in 24 | start) 25 | echo -n "Starting $DESC: " 26 | $DAEMON -d $OPTS 27 | echo "$NAME." 28 | ;; 29 | stop) 30 | echo -n "Stopping $DESC: " 31 | $DAEMON -d -k $OPTS 32 | echo "$NAME." 
33 | ;; 34 | reload) 35 | ;; 36 | restart|force-reload) 37 | $0 stop 38 | $0 start 39 | ;; 40 | *) 41 | N=/etc/init.d/$NAME 42 | # echo "Usage: $N {start|stop|restart|reload|force-reload}" >&2 43 | echo "Usage: $N {start|stop|restart|force-reload}" >&2 44 | exit 1 45 | ;; 46 | esac 47 | 48 | exit 0 49 | -------------------------------------------------------------------------------- /Grun.pm: -------------------------------------------------------------------------------- 1 | package Grun; 2 | 3 | use Exporter; 4 | 5 | use JSON::XS; 6 | use File::Temp qw(tempfile); 7 | use Carp; 8 | 9 | our @ISA=qw(Exporter); 10 | our @EXPORT=qw(grun grun_wait grun_kill); 11 | 12 | my %JTMP; 13 | sub grun_wait { 14 | my ($jid) = @_; 15 | my $ret = system("grun -q wait $jid 2>&1"); 16 | if ($ret) { 17 | $ret =(($ret<<8)&255) if $ret > 255; 18 | $ret = 1 if !$ret; 19 | } 20 | 21 | open (my $fh, '<', $JTMP{$jid} . ".out"); 22 | local $/=undef; 23 | my $out=<$fh>; 24 | close $fh; 25 | 26 | # copy pasted from below... make a function! 27 | $out=decode_json($out); 28 | if ($out->{err}) { 29 | # remote eval died... so we do too 30 | die $out->{err}; 31 | }; 32 | if (wantarray) { 33 | # return array 34 | return @{$out->{ret}}; 35 | } else { 36 | # return single value 37 | return $out->{ret}->[0]; 38 | } 39 | } 40 | 41 | sub grun { 42 | # this is only required on the execution node....so don't use it everywhere if not needed 43 | require B::RecDeparse; 44 | 45 | # at most 9 levels deep 46 | my $deparse=B::RecDeparse->new(level=>9); 47 | 48 | my ($op, $func, @args) = @_; 49 | croak("usage: grun({options}, \\\&function, \@args)") unless ref($func) eq 'CODE' && defined(wantarray); 50 | ($fh, $filename) = tempfile(".grun.XXXXXX", DIR=>"."); 51 | my $code=$deparse->coderef2text($func); 52 | my $def=encode_json({code=>$code, args=>\@args, wantarray=>wantarray}); 53 | print $fh $def; 54 | close $fh; 55 | 56 | my $opts; 57 | if ($op->{nowait}) { 58 | $opts = "-o $filename.out -nowait"; 59 | } 60 | 61 | my $cmd = "grun $opts $^X -MGrun -e \"\\\"Grun::exec('$filename')\\\"\""; 62 | 63 | # get output (json string) 64 | 65 | my $out = `$cmd`; 66 | if ($op->{nowait}) { 67 | my ($jid) = $out =~ /job_id.*:\s*(\d+)/i; 68 | $JTMP{$jid}=$filename; 69 | return $jid; 70 | } 71 | 72 | $out=decode_json($out); 73 | if ($out->{err}) { 74 | # remote eval died... so we do too 75 | die $out->{err}; 76 | }; 77 | if (wantarray) { 78 | # return array 79 | return @{$out->{ret}}; 80 | } else { 81 | # return single value 82 | return $out->{ret}->[0]; 83 | } 84 | } 85 | 86 | sub exec { 87 | my ($fil) = @_; 88 | local $/ = undef; 89 | open( my $fh, '<', $fil ); 90 | my $json = <$fh>; 91 | close $fh; 92 | 93 | my $hash=decode_json($json); 94 | my $sub = "sub " . $hash->{code}; 95 | $sub = eval($sub); 96 | my (@ret, $ret, $err); 97 | eval { 98 | if ($hash->{wantarray}) { 99 | @ret=&{$sub}(@{$hash->{args}}); 100 | } else { 101 | # scalar context 102 | $ret=&{$sub}(@{$hash->{args}}); 103 | @ret=(($ret)); 104 | } 105 | }; 106 | my $err=$@; 107 | my $out=encode_json({ret=>\@ret, err=>$err}); 108 | 109 | # return output via STDOUT 110 | print $out; 111 | } 112 | 113 | 114 | 1; 115 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | * Description: 2 | 3 | A lightweight replacement for job queueing systems like LSF, Torque, condor, SGE, for private clusters. 
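A minimal sketch of asynchronous use of the Grun.pm module whose source appears above (the nowait option and grun_wait are taken from the module source; my_function and its arguments are placeholders):

    use Grun;
    # submit without blocking; returns the grun job id
    my $jid = grun({nowait => 1}, \&my_function, "arg1", "arg2");
    # ... do other local work ...
    # block until the remote job finishes, then collect its return value(s)
    my @results = grun_wait($jid);

The function is deparsed locally and recompiled on the execution node, so it should be self-contained (no closures over local file handles or sockets).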
4 |
5 | * Installation:
6 |
7 | You need Perl and ZMQ::LibZMQ3
8 |
9 | apt-get install libzmq3
10 | cpanm ZMQ::LibZMQ3
11 |
12 | Put the grun script in /usr/bin. Type grun -h for help with configuration and setup. Basically you put it on all the machines, and you name a "master" in the main config. It is, far and away, the easiest grid job queuing system out there.
13 |
14 | Example /etc/grun.conf on the master node:
15 |
16 | master: foo.mydomain.local
17 | services: queue
18 | log_file: /var/log/grun.log
19 |
20 | Example /etc/grun.conf on the compute nodes:
21 |
22 | master: foo.mydomain.local
23 | services: exec
24 | log_file: /var/log/grun.log
25 |
26 | Once you have those two daemons running (grun -d), you can run this on either node:
27 |
28 | grun hostname
29 |
30 | It will print which node the command ran on, confirming that your installation works.
31 |
32 | * What you do with it:
33 |
34 | Submit jobs and grun puts them in a queue. The queue is on disk, so machines can lose their connection at any time and jobs keep going. Specify resource requirements, or don't - and let grun figure them out.
35 |
36 | grun assigns jobs randomly, but tends to fill up one machine at a time in case big jobs come along. It should detect NFS mounts and do the right thing with them automatically. It has a fast i/o subsystem suitable for terabyte files. If all the machines are busy, jobs queue up. Resources are soft-locked by jobs until they finish, and if a non-grun job is using a machine, that is accounted for.
37 |
38 | NOTE: Jobs that require changing sets of resources should call grun themselves - forcing later commands to requeue:
39 |
40 | For example:
41 |
42 | > grun -m 3000 myjob.sh
43 |
44 | --- myjob.sh: ---
45 | perl usesalotofmemory.pl
46 | grun -c 4 -m 500 perl uses4cpusbutlessram.pl
47 |
48 | The memory and cpu usage of the parent program drops to zero once the second script is launched. In this way very complex jobs can be scheduled without a special workflow system. Just use bash, perl or ruby.
49 |
50 | * What it does now:
51 |
52 | It does the queueing, i/o, node-matching to requirements, suspend/resume, and priorities, and has a great config system. The "auto conf" allows sysadmins to create policies based on the parameters or script contents of a job. Grun has run millions of jobs in a production cluster of about 52 nodes with 24-48 cores each. Grun times every command and records memory/cpu/io. Ulimits are set on jobs, and can be scaled to a multiple of the requested amount (elbowing). Jobs are editable, killable and suspend/resumable. You can specify "make-like" semantics on jobs, and grun will check inputs/outputs and run a job only if needed.
53 |
54 | Grun comes with a Perl module that allows you to execute Perl functions on remote hosts by copying the code and its arguments over to the remote machine. For example:
55 |
56 | use Grun;
57 | my $results = grun({}, \&mycrazyfunction, "param");
58 |
59 | It decompiles the function locally, and recompiles it remotely.
60 |
61 | * What it doesn't do yet (TODO):
62 |
63 | - needs a "plugin" architecture. Should be easy with Perl.
64 | - graceful restart... with socket handoff (it's always safe to restart... but it can cause waiters to pause)
65 | - harass users when the queue drive is close to full, when requested cpu/ram is too high to ever run, or for other (configurable) issues
66 |
67 | * Goals:
68 |
69 | Small Keep the code under a few thousand lines; it should be readable, with lots of comments.
70 | Simple Easy configuration, guesses default values 71 | Smart Keeps track of stats. Learns what to do with things. Puts the right jobs on the right machines. 72 | Fast Hundreds of thousands of jobs per hour on tens of thousands of machines should be easy. 73 | Configurable Make up lots of new things you want it to keep track of and allocate to jobs (like disk i/o, user limits, etc.) 74 | 75 | * Features to avoid: 76 | 77 | No security Grun has no security. It has to be behind a firewall on dedicated machines. This limits it, and keeps it simple. It's not hard to put up an ssh tunnel, and make it work at EC2. But I'm not building in kerberos or stuff like that. 78 | 79 | One platform Grun only works on unix-like machines that have perl installed. 80 | 81 | Nothing fancy Grun doesn't support MPI and other fancy grid things (although you can layer map-reduce on it) 82 | 83 | * Advanced usage: 84 | 85 | Example of a config on an ssh tunneled EC2 node (tested), with autostaging via gwrap: 86 | 87 | services: exec 88 | master: 127.0.0.1:5184 89 | bind: 127.0.0.2:5184 90 | wrap: /usr/bin/gwrap 91 | fetch_path: /opt:/mnt/ufs 92 | 93 | gwrap (source only) is a useful tool. Used right, it will copy all the dependent files to the remote host, only as needed, and then, when the remote program is finished, it will copy all the output files back to the local host. Of course, this *could* be built in to grun, via the -M option, but it isn't yet. gwrap is not yet heavily used in production, and if you do decide to use it heavly, please let me know... I'll help add things like logging, failure recovery as needed. 94 | 95 | 96 | -------------------------------------------------------------------------------- /qsub: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # qsub -pe smp ${NPROC} -S /bin/bash -V -q secondary -N ${JOB_ID} -o ${STDOUT_FILE} -e ${STDERR_FILE} ${EXTRAS} ${CMD} 4 | 5 | use strict; 6 | use Getopt::Long qw(:config no_ignore_case); 7 | use Data::Dumper; 8 | use Cwd qw(getcwd cwd); 9 | 10 | open LOG, ">>/opt/log/qsub.log"; 11 | print LOG scalar(localtime), "\t", cwd(), "\t", join " ", @ARGV, "\n"; 12 | close LOG; 13 | 14 | my $wd = cwd(); 15 | 16 | $ENV{SGE_ROOT} = "$wd"; 17 | mkdir("qtmp"); 18 | $ENV{SGE_CELL} = "qtmp"; 19 | mkdir("qtmp/common"); 20 | open Z, ">qtmp/common/settings.sh"; 21 | close Z; 22 | chmod 0755, "qtmp/common/settings.sh"; 23 | 24 | my %opt; 25 | 26 | my $nproc; 27 | # get rid of weird 2-argument -pe option 28 | for (my $i=0;$i<@ARGV;++$i) { 29 | $_=$ARGV[$i]; 30 | if (/-pe/) { 31 | $opt{pe} = $ARGV[$i+1]; 32 | $nproc = $ARGV[$i+2]; 33 | splice @ARGV, $i, 3; 34 | last; 35 | } 36 | } 37 | 38 | GetOptions(\%opt, "A=s", "S=s", "V", "p=s", "q=s", "N=s", "hold_jid=s", "o=s", "e=s", "I", "j=s", "l=s@", "memory|m=s", "M=s", "v=s@", "sync=s", "cwd", "t=s") || die; 39 | $nproc = 1 if !$nproc; 40 | 41 | die "qsub: some weird option not supported: $ARGV[0]\n" if $ARGV[0] =~ /^-/; 42 | die "qsub: bad args for -pe\n" unless $nproc > 0; 43 | die "qsub: no command specified\n" unless @ARGV; 44 | 45 | for (@ARGV) { 46 | if (/\'/) { 47 | s/'/"'"/g; 48 | } else { 49 | if (/\"/) { 50 | if (/\'/) { 51 | s/'/"'"/g; 52 | } 53 | $_="'".$_."'"; 54 | } 55 | } 56 | } 57 | 58 | my $cmd = join " ", @ARGV; 59 | 60 | if ($opt{o}) { 61 | if (-d $opt{o}) { 62 | $cmd = "$cmd > $opt{o}/$$.log"; 63 | } else { 64 | $cmd = "$cmd > $opt{o}" 65 | } 66 | } else { 67 | die "qsub: need output file (-o)\n"; 68 | } 69 | 70 | 71 | my @opj; 72 | 73 | if 
($opt{N}) { 74 | @opj = ("-j", "qsub$opt{N}"); 75 | $opt{e} = "STDIN.e$opt{N}" if !$opt{e}; 76 | $ENV{SGE_TASK_ID} = $opt{N}; 77 | } else { 78 | $ENV{SGE_TASK_ID} = time(); 79 | } 80 | 81 | # what does qsub do when no jobid is specified? 82 | $opt{e} = "STDIN.e" if !$opt{e}; 83 | $cmd = "$cmd 2> $opt{e}"; 84 | 85 | 86 | $opt{memory} = 1000 * $nproc; 87 | 88 | if ($opt{hold_jid}) { 89 | $opt{hold_jid} =~ s/\*/\.\*/g; 90 | my $running = `/opt/bin/grun -q jo -f '%jid\t%cwd\t%jobx\n' | grep '$opt{hold_jid}\$' | grep $wd`; 91 | $running =~ s/\s+//g; 92 | while ($running) { 93 | sleep(30); 94 | } 95 | } 96 | 97 | if ($ENV{GRUN_HOSTS}) { 98 | push @opj, ("-h", $ENV{GRUN_HOSTS}); 99 | } 100 | 101 | my $taskid = -1; 102 | if ($opt{t}) { 103 | my ($l, $h) = $opt{t} =~ /(\d+)-(\d+)/; 104 | if (!$l) { 105 | $l = 0; 106 | $h = $opt{t}-1; 107 | } 108 | $ENV{SGE_TASK_FIRST} = $l; 109 | $ENV{SGE_TASK_LAST} = $h; 110 | my @jobl; 111 | push @opj, "-nowait"; 112 | for (my $i = $l; $i<= $h; ++$i) { 113 | $ENV{SGE_TASK_ID} = $i; 114 | push @jobl, dojob(); 115 | } 116 | print "DEBUG START $opt{N}: jobs @jobl\n"; 117 | if($opt{sync}) { 118 | my $running = 1; 119 | while ($running) { 120 | sleep(15); 121 | my $runl = `/opt/bin/grun -q jo -f '%jid\\n'`; 122 | #my $runl = `/opt/bin/grun -q jo`; 123 | if (!$runl) { 124 | print "DEBUG NO JOBS???? [$?] $opt{N}: ((($runl)))\n"; 125 | next; 126 | } 127 | my %runl = map {chomp; $_=>1} $runl; 128 | my @runl = %runl; 129 | $running = 0; 130 | for (@jobl) { 131 | #if ($runl{$_}) { 132 | if ($runl =~ /$_/) { 133 | $running = 1; 134 | } 135 | } 136 | if (!$running) { 137 | print "DEBUG DONE $opt{N}: ((( $runl )))\n"; 138 | } else { 139 | print "DEBUG WAITING $opt{N}: ( $runl )\n"; 140 | } 141 | } 142 | sleep(15); 143 | } else { 144 | print "DEBUG NOSYNC $opt{N}\n"; 145 | } 146 | } else { 147 | if(!$opt{sync}) { 148 | push @opj, "-nowait"; 149 | } else { 150 | print "DEBUG SYNC 1 JOB $opt{N}\n"; 151 | } 152 | dojob(); 153 | } 154 | 155 | sub dojob { 156 | my $grun_out; 157 | my $tmpf = "/opt/tmp/qsub.$$.out"; 158 | my $ext = ""; 159 | if ($taskid >=0 ) { 160 | $tmpf = "/opt/tmp/qsub.$$.$taskid.out"; 161 | $ext = " [$taskid]"; 162 | } 163 | 164 | warn(join(" ", "+", "grun", "-v", "-c", $nproc, "-m", $opt{memory}, @opj, $cmd), "\n"); 165 | 166 | open OLDOUT, ">&STDOUT"; 167 | open OLDERR, ">&STDERR"; 168 | open STDOUT, ">$tmpf" or die "Can't open STDOUT: $!"; 169 | open STDERR, ">&STDOUT"; 170 | 171 | my $ret = system("grun", "-v", "-c", $nproc, "-m", $opt{memory}, @opj, $cmd); 172 | open STDOUT, ">&OLDOUT"; 173 | open STDERR, ">&OLDERR"; 174 | my %grun; 175 | open IN, $tmpf; 176 | while() { 177 | my ($k, $v) = m/([^:]+):?\s*(.*)/; 178 | $k =~ s/ /_/g; 179 | $grun{lc($k)}=$v;; 180 | } 181 | close IN; 182 | unlink $tmpf; 183 | if ($ret == 0) { 184 | print "Your job $grun{job_id}$ext \(\"$cmd\"\) has been submitted\n"; 185 | } 186 | if ($opt{sync} && ! 
-s $opt{e}) { 187 | unlink($opt{e}); 188 | } 189 | return $grun{job_id}; 190 | } 191 | -------------------------------------------------------------------------------- /t/grun.t: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | my $test_count; 4 | 5 | BEGIN { 6 | chdir($1) if ($0=~/(.*)\//); 7 | mkdir("tmp"); 8 | $test_count = 18; 9 | } 10 | 11 | use Cwd; 12 | use Test::More tests=>$test_count; 13 | use Time::HiRes qw(sleep time); 14 | use Getopt::Long; 15 | 16 | my $DEFPROG="../grun"; 17 | my $prog=$DEFPROG; 18 | GetOptions("prog=s"=>\$prog); 19 | 20 | if (!($DEFPROG eq $prog)) { 21 | diag("Using program $prog\n"); 22 | } 23 | 24 | my $tmpdir = cwd() . "/tmp"; 25 | 26 | `rm -rf $tmpdir/grun-test.*`; 27 | 28 | # write out a config file listening on 127.0.0.1 port 9418 29 | 30 | my $CONF = "$tmpdir/grun-test.conf"; 31 | 32 | mkdir "$tmpdir/grun-test.spool"; 33 | 34 | ok(-d "$tmpdir/grun-test.spool", "grun spool"); 35 | 36 | # todo... get an ephemeral port and use it 37 | 38 | my $cwd = cwd(); 39 | 40 | open O, ">$CONF"; 41 | print O <<"EOF"; 42 | master: 127.0.0.1 43 | port: 9417 44 | services: queue,exec 45 | log_file: $tmpdir/grun-test.log 46 | spool: $tmpdir/grun-test.spool 47 | pid_file: $tmpdir/grun-test.pid 48 | cpus: 4 49 | env: * 50 | auto_profile: /etc/grun.prof 51 | expire_secs: 1200 52 | spread_pct: .05 53 | query_default: \$self 54 | log_types: note error warn debug 55 | param-io: 8 56 | trace: 2 57 | EOF 58 | 59 | 60 | #start execution daemon 61 | 62 | my $grun="$prog -C $CONF"; 63 | 64 | `pkill -9 -f $CONF`; 65 | 66 | $SIG{ALRM} = sub { die "Timeout while trying to initiate daemon, aborting test\n" }; 67 | alarm 120; 68 | 69 | diag("$grun -d"); 70 | 71 | is($ok=system("$grun -d 2> tmp/grun-test.d.err"),0,"grun daemon"); 72 | if ($ok) { 73 | $ok = $ok >> 8; 74 | diag("Daemon fail exit code:$ok, orig-code: $?, str: $!") if $ok; 75 | } 76 | alarm 0; 77 | $SIG{ALRM} = sub {}; 78 | 79 | sleep(.25); 80 | 81 | $pid =0+`cat $tmpdir/grun-test.pid`; 82 | cmp_ok($pid,">",1,"grun pid"); 83 | 84 | SKIP: { 85 | skip "grun won't start", $test_count-3 unless $pid > 1; 86 | 87 | $stat=`$grun -d -r`; 88 | ok($stat=~/Ok/i,"reload ok"); 89 | 90 | # simple 'hello' test 91 | diag("$grun echo hello"); 92 | $out=`$grun echo hello`; 93 | 94 | is($out, "hello\n", "hello works"); 95 | 96 | my $out =`$grun -nowait sleep 10`; 97 | $jobs=`$grun -q jo`; 98 | like($jobs, qr/sleep/,"jobs list"); 99 | 100 | my ($id) = $out =~ /Job_ID:\s*(\d+)/; 101 | 102 | ok($id>0, "Returns jobid on nowait"); 103 | 104 | $out=`$grun -k $id 2>&1`; 105 | like($out,qr/$id aborted/,"Kill works"); 106 | 107 | $out=`$grun -q hist -c 5000`; 108 | like($out,qr/sleep/,"Hist works"); 109 | 110 | $out=0+`$grun "yes | head -1000" | wc -l`; 111 | is($out,1000,"test pipe 1000"); 112 | 113 | local $SIG{ALRM} = sub { kill $pid }; 114 | 115 | # creates an 'abusive' batch file 116 | my $count=100; 117 | open O, ">$tmpdir/grun-test.fork.sh"; 118 | for ($i=0;$i<$count;++$i) { 119 | print O "$grun echo $i >> $tmpdir/grun-test.fork.out 2>&1 &\n"; 120 | } 121 | print O "wait\n"; 122 | close O; 123 | 124 | $took_too_long=0; 125 | 126 | if (!($pid = fork)) { 127 | # run a bunch of simultaneous gruns 128 | exec("bash $tmpdir/grun-test.fork.sh"); 129 | } else { 130 | # meanwhile... 
wait for that to finishe 131 | use POSIX ":sys_wait_h"; 132 | # wait for a while 133 | $start=time(); 134 | diag("fork/timing test wait"); 135 | while(waitpid($pid, WNOHANG)!=$pid) { 136 | sleep(.25); 137 | if (time() > $start+600) { 138 | $took_too_long=1; 139 | diag("took too long, killing"); 140 | kill 2, $pid; 141 | last; 142 | } 143 | } 144 | } 145 | 146 | # did it take too long? 147 | ok(!$took_too_long, "grun fork speed"); 148 | cmp_ok(0+`wc -l $tmpdir/grun-test.fork.out`, "==", $count, "fork got $count responses"); 149 | 150 | # ok now try even more.. , but using the nicer "nowait" semantic 151 | $count=200; 152 | for ($i=0;$i<$count;++$i) { 153 | `$grun -nowait 'echo $i >> $tmpdir/grun-test.nofork.out 2>&1'`; 154 | } 155 | 156 | # do things respond nicely when the queue is loaded? 157 | $stat=`$grun -q st`; 158 | ok($stat=~/Hostname/,"stat return"); 159 | 160 | # ok wait for stuff 161 | while(1) { 162 | sleep(.25); 163 | if (time() > $start+600) { 164 | $took_too_long=1; 165 | diag("took too long!"); 166 | `pkill -INT $CONF`; 167 | last; 168 | } 169 | last if $count == `wc -l $tmpdir/grun-test.nofork.out 2>/dev/null`; 170 | } 171 | 172 | # did it return in time? 173 | ok(!$took_too_long, "grun nofork speed"); 174 | cmp_ok(0+`wc -l $tmpdir/grun-test.nofork.out`, "==", $count, "nowait got $count responses"); 175 | 176 | # shut down the daemon 177 | is(system("$grun -d -k"),0,"grun daemon kill"); 178 | 179 | # linger a bit 180 | sleep(2); 181 | 182 | # is it running? 183 | $running = `pgrep -f $tmpdir/grun-test`; 184 | chomp($running); 185 | ok(!$running, "daemon stop worked"); 186 | 187 | # sleep a bit 188 | sleep(.25); 189 | 190 | # ensure it's really gon 191 | if (`pgrep -f $CONF` =~ /\d+/) { 192 | `pkill -9 -f $CONF`; 193 | ok(0, "no stranded childredn"); 194 | } else { 195 | ok(1, "no stranded childredn"); 196 | } 197 | 198 | } 199 | 200 | sleep(1); 201 | -------------------------------------------------------------------------------- /gwrap.c: -------------------------------------------------------------------------------- 1 | // neededd for RTLD_NEXT 2 | #define _GNU_SOURCE 3 | 4 | #include 5 | #include 6 | 7 | #define _FCNTL_H 8 | #include 9 | //#define _SYS_STAT_H 10 | //#include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | /* 22 | #ifndef stat 23 | extern int stat (__const char *__restrict __file, struct stat *buf); 24 | # else 25 | extern int stat64 (__const char *__restrict __file, struct stat64 *buf); 26 | #endif 27 | */ 28 | 29 | /* 30 | * Compile with: 31 | * 32 | * gcc -nostartfiles -fpic -shared -o gwrap.so gwrap.c -ldl 33 | */ 34 | 35 | /* #define DEBUG 1 */ 36 | 37 | static int (*next__open)(const char *, int, mode_t) = NULL; 38 | static int (*nextopen)(const char *, int, mode_t) = NULL; 39 | static int (*nextopen64)(const char *, int, mode_t) = NULL; 40 | static int (*nextstat)(int ver, const char *, struct stat *) = NULL; 41 | static int (*next__lxstat)(int ver, const char *, struct stat *) = NULL; 42 | static int (*nextstat64)(int ver, const char *, struct stat64 *) = NULL; 43 | static int (*next__lxstat64)(int ver, const char *, struct stat64 *) = NULL; 44 | 45 | #define BUF_MAX 4096 46 | static char buffer[BUF_MAX]; 47 | static char tmpname[4096]; 48 | static char *fetch_program = NULL; 49 | static int did_init = 0; 50 | 51 | void do_init(void); 52 | 53 | void _init(void) 54 | { 55 | if (did_init != 1) do_init(); 56 | } 57 | 58 | struct tree { 59 | char *p; 60 | int n; 61 | 
struct tree *l; 62 | struct tree *r; 63 | }; 64 | 65 | static struct tree *path_tree; 66 | 67 | void tree_insert(struct tree **n, const char *p, int lp) { 68 | if (!*n) { 69 | *n = (struct tree *) malloc(sizeof(struct tree)); 70 | (*n)->p = (char *) malloc(lp+1); 71 | strncpy((*n)->p, p, lp); 72 | (*n)->n=lp; 73 | (*n)->p[lp]='\0'; 74 | (*n)->r = (*n)->l = NULL; 75 | //printf("ADD %s %d, n:%x l:%x r:%x\n", p, lp, *n, &(*n)->l, &(*n)->r); 76 | } else { 77 | int l = lp > (*n)->n ? (*n)->n : lp; 78 | int rv = strncmp(p, (*n)->p, l); 79 | if (rv > 0) { 80 | tree_insert(&(*n)->r, p, lp); 81 | } else if (rv < 0) { 82 | tree_insert(&(*n)->l, p, lp); 83 | } else { 84 | if ((*n)->n > lp) { 85 | (*n)->p[lp]='\0'; 86 | (*n)->n=lp; 87 | } 88 | } 89 | } 90 | } 91 | 92 | int tree_search(struct tree *n, const char *p, int lp) { 93 | if (!n) return 0; 94 | int l = lp > n->n ? n->n : lp; 95 | int rv = strncmp(p, n->p, l); 96 | if (rv > 0) { 97 | return tree_search(n->r, p, lp); 98 | } else if (rv < 0) { 99 | return tree_search(n->l, p, lp); 100 | } else { 101 | return 1; 102 | } 103 | } 104 | 105 | void do_init(void) 106 | { 107 | char *p, *e; 108 | next__open = dlsym(RTLD_NEXT,"__open"); 109 | nextopen = dlsym(RTLD_NEXT,"open"); 110 | nextopen64 = dlsym(RTLD_NEXT,"open64"); 111 | nextstat = dlsym(RTLD_NEXT,"__xstat"); 112 | next__lxstat = dlsym(RTLD_NEXT,"__lxstat"); 113 | nextstat64 = dlsym(RTLD_NEXT,"__xstat64"); 114 | next__lxstat64 = dlsym(RTLD_NEXT,"__lxstat64"); 115 | 116 | fetch_program=getenv("GFETCH_PROGRAM"); 117 | if (fetch_program && !strstr(fetch_program, "%s")) { 118 | fprintf(stderr, "Warning: Invalid GFETCH_PROGRAM\n"); 119 | fetch_program=NULL; 120 | } 121 | p=getenv("GFETCH_PATH"); 122 | 123 | #ifdef DEBUG 124 | fprintf(stderr, "GFETCH INIT %s, %s\n", p, fetch_program); 125 | #endif 126 | 127 | e = strchr(p,':'); 128 | while (e) { 129 | tree_insert(&path_tree, p, e-p); 130 | p = e+1; 131 | e = strchr(p,':'); 132 | } 133 | tree_insert(&path_tree, p, strlen(p)); 134 | 135 | //printf("HERE %d\n", tree_search(path_tree, "/tmp", 4)); 136 | 137 | did_init = 1; 138 | } 139 | 140 | int openhandler(const char *pathname, int flags, int mode, int stat) 141 | { 142 | FILE *remote; 143 | int len = -1; 144 | pid_t pid; 145 | int pipes[2]; 146 | int fd, tb, nb, rv; 147 | char *p; 148 | 149 | if (*pathname == '/') { 150 | p = strrchr(pathname, '/'); 151 | rv = tree_search(path_tree, pathname, p-pathname); 152 | } else { 153 | *buffer='\0'; 154 | if (!getcwd(buffer, sizeof(buffer))) { 155 | buffer[BUF_MAX-1] ='\0'; 156 | } 157 | rv = tree_search(path_tree, buffer, strlen(buffer)); 158 | } 159 | if (!rv) return -1; 160 | 161 | #ifdef DEBUG 162 | printf("GotHandler %s, %x, %x\n", pathname, flags, mode); 163 | #endif 164 | snprintf(buffer, sizeof(buffer), fetch_program, pathname); 165 | 166 | /* program i call should not, also, preload me */ 167 | unsetenv("LD_PRELOAD"); 168 | unsetenv("GFETCH_PROGRAM"); 169 | if (rv=system(buffer)) { 170 | return rv; 171 | } 172 | 173 | #ifdef DEBUG 174 | fprintf(stderr, "FP OK %d\n", rv); 175 | #endif 176 | fd = (*nextopen)(pathname, flags, mode); 177 | return fd; 178 | } 179 | 180 | int open(const char *pathname, int flags, mode_t mode) 181 | { 182 | if (did_init != 1) do_init(); 183 | 184 | #ifdef DEBUG 185 | fprintf(stderr, "GotOPEN: %s, %s\n", pathname, fetch_program); 186 | #endif 187 | struct stat st; 188 | if (fetch_program && ((flags & O_ACCMODE) != O_CREAT)) { 189 | stat(pathname, &st); 190 | } 191 | return (*nextopen)(pathname, flags, mode); 192 | } 193 | 194 | 
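/*
 * Note on the open()/open64()/__open() wrappers: the explicit
 * stat(pathname, &st) call they make is what actually triggers a fetch.
 * stat() resolves to the interposed __xstat() below, which, when the real
 * stat fails with ENOENT and the path falls under one of the GFETCH_PATH
 * prefixes, runs the GFETCH_PROGRAM command (by default a grun file fetch
 * from the master) and retries.  By the time the real open() runs, the
 * file should already be on local disk.
 */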
int open64(const char *pathname, int flags, mode_t mode) 195 | { 196 | if (did_init != 1) do_init(); 197 | #ifdef DEBUG 198 | fprintf(stderr, "GotOPEN 64: %d, %s, %s\n", did_init, pathname, fetch_program); 199 | #endif 200 | struct stat st; 201 | if (fetch_program && ((flags & O_ACCMODE) != O_CREAT)) { 202 | stat(pathname, &st); 203 | } 204 | if (!nextopen64) { 205 | errno=EINVAL; 206 | return -1; 207 | } 208 | return (*nextopen64)(pathname, flags, mode); 209 | } 210 | 211 | int __open(const char *pathname, int flags, int mode) 212 | { 213 | if (did_init != 1) do_init(); 214 | #ifdef DEBUG 215 | fprintf(stderr, "Got__OPEN\n", pathname); 216 | #endif 217 | struct stat st; 218 | if (fetch_program && ((flags & O_ACCMODE) != O_CREAT)) { 219 | stat(pathname, &st); 220 | } 221 | return (*nextopen)(pathname, flags, mode); 222 | } 223 | 224 | 225 | int __xstat(int ver, const char *pathname, struct stat *buf) 226 | { 227 | if (did_init != 1) do_init(); 228 | int ret; 229 | ret=(*nextstat)(ver, pathname, buf); 230 | if (fetch_program && ret && errno == ENOENT) { 231 | openhandler(pathname, O_RDONLY, 00755, 1); 232 | ret=(*nextstat)(ver, pathname, buf); 233 | } 234 | return ret; 235 | } 236 | 237 | int __lxstat(int ver, const char *pathname, struct stat *buf) 238 | { 239 | if (did_init != 1) do_init(); 240 | int ret; 241 | ret=(*next__lxstat)(ver, pathname, buf); 242 | if (fetch_program && ret && errno == ENOENT) { 243 | openhandler(pathname, O_RDONLY, 00755, 1); 244 | ret=(*next__lxstat)(ver, pathname, buf); 245 | } 246 | return ret; 247 | } 248 | 249 | int __xstat64(int ver, const char *pathname, struct stat64 *buf) 250 | { 251 | if (did_init != 1) do_init(); 252 | int ret; 253 | ret=(*nextstat64)(ver, pathname, buf); 254 | if (fetch_program && ret && errno == ENOENT) { 255 | openhandler(pathname, O_RDONLY, 00755, 1); 256 | ret=(*nextstat64)(ver, pathname, buf); 257 | } 258 | return ret; 259 | } 260 | 261 | int __lxstat64(int ver, const char *pathname, struct stat64 *buf) 262 | { 263 | if (did_init != 1) do_init(); 264 | int ret; 265 | ret=(*next__lxstat64)(ver, pathname, buf); 266 | if (fetch_program && ret && errno == ENOENT) { 267 | openhandler(pathname, O_RDONLY, 00755, 1); 268 | ret=(*next__lxstat64)(ver, pathname, buf); 269 | } 270 | return ret; 271 | } 272 | 273 | -------------------------------------------------------------------------------- /xssh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | ######### xssh 4 | # launches a program on any number of nodes 5 | # can be the same on all, or variations based on expansion 6 | ######### 7 | 8 | # grun - lightweight jobs queueing system 9 | # Copyright (C) 2011 Erik Aronesty 10 | # 11 | # This program is free software: you can redistribute it and/or modify 12 | # it under the terms of the GNU General Public License as published by 13 | # the Free Software Foundation, either version 3 of the License, or 14 | # (at your option) any later version. 15 | # 16 | # This program is distributed in the hope that it will be useful, 17 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | # GNU General Public License for more details. 20 | # 21 | # You should have received a copy of the GNU General Public License 22 | # along with this program. If not, see . 
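#
# Usage sketch (flags taken from the GetOptions call below; host names are
# placeholders):
#
#   xssh --nodes=node01,node02 uptime      # run "uptime" on both nodes in parallel
#   xssh --seq --echo 'df -h /scratch'     # one node at a time, echoing each command
#   xssh --list                            # just print the expanded node list
#
# When invoked under a name containing "sync" or "diff" (e.g. as xsync or
# xdiff), the same script rsyncs files to, or diffs them against, each node
# instead of running a command.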
23 | 24 | use strict; 25 | use Getopt::Long qw(:config pass_through); 26 | $Getopt::Long::order = $REQUIRE_ORDER; 27 | use IO::File; 28 | use Cwd qw(cwd); 29 | use File::Spec; 30 | 31 | use Data::Dumper; 32 | 33 | sub usage; 34 | 35 | our %opt; 36 | 37 | my $pwd = cwd(); 38 | die unless $pwd; 39 | 40 | my $defnodes; 41 | my %filerules; 42 | my @mnt; 43 | 44 | readconf(); 45 | 46 | GetOptions(\%opt, "nodes=s@", "echo", "plain", "seq", "sudo", "debug", "recursive","force","list") || die usage(); 47 | 48 | push @{$opt{nodes}}, $defnodes if (!defined($opt{nodes})); 49 | $opt{mode} = $0 =~ /diff/ ? 'diff' : $0 =~ /sync/ ? 'sync' : 'ssh'; 50 | 51 | die usage() unless @ARGV || $opt{list}; 52 | 53 | my $spid; 54 | # this kills kids when you press control-C 55 | sub on_int { 56 | kill 2, $spid if $spid; 57 | } 58 | 59 | $opt{nodes} = join ',', @{$opt{nodes}}; 60 | # expand node names 61 | my @nodes=expandnodes($opt{nodes}); 62 | 63 | # uniquify 64 | my %seen; 65 | foreach my $node (@nodes) {chomp; $seen{$node}=1 if $node; } 66 | @nodes=keys(%seen); 67 | 68 | if ($opt{list}) { 69 | for (@nodes) { 70 | print $_, "\n"; 71 | } 72 | exit 0; 73 | } 74 | 75 | print STDERR "Nodes: " . join(' ', @nodes). "\n\n"; 76 | 77 | my $windows= $^O =~ /^(ms)?win/i; 78 | my $host = $windows ? $ENV{COMPUTERNAME} : $ENV{HOSTNAME}; 79 | 80 | # on windows, plink is the ssh equivalent TODO: config this 81 | my $ssh = $windows?'plink':'ssh'; 82 | my $sshopts = '-o ForwardX11=no -o ForwardX11Trusted=no -o ConnectTimeout=30 -o SendEnv="PATH LANG" -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no'; 83 | 84 | # launch kids on the nodes 85 | 86 | if ($opt{mode} eq 'diff') { 87 | for (@ARGV) { 88 | my $f = File::Spec->rel2abs($_); 89 | if (! -s $f) { 90 | warn "file not found: $f"; 91 | next; 92 | } 93 | if (-d $f) { 94 | die "Won't diff $f recursively"; 95 | } 96 | for (@mnt) { 97 | if ($f =~ /^\Q$_\E/) { 98 | die "$_ is mounted now, so no need to $opt{mode}" 99 | } 100 | } 101 | my $fb = $f; 102 | $fb =~ s/.*\///; 103 | for my $node (@nodes) { 104 | next if $ENV{HOSTNAME} =~ /^$node\b/; 105 | 106 | my $opt; 107 | if (!$opt && $opt{sudo}) { 108 | $opt="--rsync-path='sudo rsync' "; 109 | } 110 | $opt = "--recursive $opt" if $opt{recursive}; 111 | my $tmp = "/tmp/xdiff:$node:$fb"; 112 | fexec($ENV{HOSTNAME}, "rsync -e 'ssh $sshopts' -lpgtoD $opt $node:$f $tmp"); 113 | } 114 | while (wait != -1) {}; 115 | for my $node (@nodes) { 116 | next if $ENV{HOSTNAME} =~ /^$node/; 117 | my $tmp = "/tmp/xdiff:$node:$fb"; 118 | my $pre = "sudo " if $opt{sudo}; 119 | system("${pre}diff -u $tmp $f"); 120 | unlink $tmp; 121 | } 122 | } 123 | } 124 | 125 | if ($opt{mode} eq 'sync') { 126 | for (@ARGV) { 127 | my $f = File::Spec->rel2abs($_); 128 | if (! 
-s $f) { 129 | warn "file not found: $f"; 130 | next; 131 | } 132 | 133 | for (@mnt) { 134 | if ($f =~ /^\Q$_\E/) { 135 | die "$_ is mounted now, so no need to $opt{mode}\n" unless $opt{force} 136 | } 137 | } 138 | 139 | my %nosync; 140 | if ($filerules{$f}) { 141 | for (@{$filerules{$f}}) { 142 | my ($rule, $nodes) = @$_; 143 | if ($rule eq 'nosync') { 144 | if ($nodes eq '*') { 145 | die "Won't sync $f because of rule in config"; 146 | } else { 147 | for (expandnodes($nodes)) { 148 | $nosync{$_} = 1; 149 | } 150 | } 151 | } 152 | } 153 | } 154 | if (-d $f && !$opt{recursive}) { 155 | die "Won't sync $f recursively unless you specify -r or --recursive\n"; 156 | } 157 | for my $node (@nodes) { 158 | my $opt; 159 | next if $ENV{HOSTNAME} =~ /^$node\./; 160 | if ($nosync{$node}) { 161 | print "Skipping $node:$f because of config.\n"; 162 | next; 163 | } 164 | 165 | if (!$opt && $opt{sudo}) { 166 | $opt="--rsync-path='sudo rsync' "; 167 | } 168 | $opt = "--recursive $opt" if $opt{recursive}; 169 | my $d = $f; 170 | $f = $f . '/' if (-d $f); 171 | fexec($ENV{HOSTNAME}, "rsync -e 'ssh $sshopts' -lpgtoD $opt$f $node:$d"); 172 | } 173 | } 174 | } 175 | 176 | if ($opt{mode} eq 'ssh') { 177 | my @cmd=@ARGV; 178 | for my $node (@nodes) { 179 | if ($opt{seq}) { 180 | nexec($node, @cmd); 181 | } else { 182 | fexec($node, @cmd); 183 | } 184 | } 185 | } 186 | 187 | # wait for all kids 188 | while (wait != -1) {}; 189 | 190 | sub nexec { 191 | my ($node, @cmd) = @_; 192 | 193 | my $cmd; 194 | my $norig = $node; 195 | 196 | grep s/(["()'])/\\$1/g, @cmd if ($opt{mode} eq 'ssh'); 197 | $cmd = join " ", @cmd; 198 | $cmd= "cd ${pwd} && " . $cmd if ($opt{mode} eq 'ssh'); 199 | 200 | print "in nexec $host ENVH $ENV{HOSTNAME}\n" if $opt{debug}; 201 | if (!($ENV{HOSTNAME} =~ /^$node/)) { # not on my own machine 202 | print "$host is host\n" if $opt{debug}; 203 | 204 | # if you're on windows... specify the username 205 | my $prefix = "$ENV{USERNAME}\@" if $windows; 206 | 207 | # quote the command 208 | $cmd =~ s/(["\$])/\\$1/g; 209 | $cmd= "$ssh $sshopts $prefix$node \"$cmd\""; 210 | print "mod cmd is $cmd\n" if $opt{debug}; 211 | } 212 | 213 | print $cmd, "\n" if $opt{echo}; 214 | 215 | if ($opt{plain}) { 216 | system($cmd); 217 | } else { 218 | # capture stderr and stdout 219 | $spid=open(my $io=new IO::File, "( $cmd ) 2>&1 |"); 220 | if (!$spid) { 221 | # failed to exec 222 | die "$norig\t$!\n"; 223 | } 224 | # show output, with nodes prefixed 225 | while(<$io>) { 226 | next if $_ =~ /^Warning: Permanently added.*to the list of known hosts\./; 227 | print "$norig\t$_" if $_; 228 | } 229 | close $io; 230 | } 231 | 232 | wait; # don't exit until child is really done... not just done closing their pipe 233 | } 234 | 235 | # show usage 236 | sub usage { 237 | if ($opt{mode} eq 'ssh') { 238 | return <) { 313 | next if /^#/; 314 | next if /^\s*$/; 315 | my ($cmd, $suf, $dat) = m|^(\w+)(?:-([^: ]+))?\s*:\s*(.*)|; 316 | if ($cmd eq 'rewrite') { 317 | my ($from,$to) = $dat =~ /^\s*(.*?)\s+(?:to:)?\s*(.*?)\s*$/; 318 | push @rewrite, [$from, '"' . $to . 
'"']; 319 | } elsif ($cmd eq 'nodes' || $cmd eq 'node') { 320 | if (!$suf) { 321 | $defnodes .= "$dat "; 322 | } else { 323 | my @n=expandnodes($dat); 324 | push @{$groups{$suf}}, @n; 325 | } 326 | } elsif ($cmd eq 'file') { 327 | my ($path,$rule,$value,$rule2,$value2) = $dat =~ /^\s*(.*?)\s+(?:(\w+):\s*(.*?)\s*)+$/; 328 | push @{$filerules{$path}}, [$rule, $value]; 329 | push @{$filerules{$path}}, [$rule2, $value2] if $rule2; 330 | } 331 | } 332 | close IN; 333 | $defnodes =~ s/\s+$//; 334 | } 335 | 336 | sub expandnodes { 337 | my $dat = shift; 338 | my @n=split(/[\s,]+/, $dat); 339 | my @d; 340 | my @a; 341 | my @x; 342 | for my $n (@n) { 343 | my $not = 1 if $n =~ s/^\-//; 344 | next unless length($n)>0; 345 | 346 | if ($n eq '+all') { 347 | @a=expandnodes($defnodes); 348 | } elsif ($groups{$n}) { 349 | @a=@{$groups{$n}}; 350 | } else { 351 | $n = rewrite($n); 352 | @a=($n); 353 | } 354 | if ($not) { 355 | push @x, @a; 356 | } else { 357 | push @d, @a; 358 | } 359 | } 360 | for my $x (@x) { 361 | @d = grep !/^\Q$x\E$/, @d; 362 | } 363 | if (!$opt{seq}) { 364 | fisher_yates_shuffle(\@d) if @d; 365 | } 366 | return @d; 367 | } 368 | 369 | 370 | 371 | sub rewrite { 372 | my $n = shift; 373 | for my $rw (@rewrite) { 374 | my ($from, $to) = @$rw; 375 | $n =~ s/^$from$/$to/ee; 376 | die "bad rewrite: $from to: $to" if $n =~ /\$/; 377 | } 378 | return $n; 379 | } 380 | 381 | sub fisher_yates_shuffle 382 | { 383 | my $array = shift; 384 | my $i = @$array; 385 | while ( --$i ) 386 | { 387 | my $j = int rand( $i+1 ); 388 | @$array[$i,$j] = @$array[$j,$i]; 389 | } 390 | } 391 | -------------------------------------------------------------------------------- /grun: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # grun - lightweight jobs queueing system 4 | # Copyright (C) 2011 Erik Aronesty 5 | # 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | use strict; 20 | 21 | use Carp qw(carp croak confess cluck); 22 | use Getopt::Long qw(GetOptions); 23 | use Data::UUID; 24 | 25 | use ZMQ::LibZMQ3; 26 | use ZMQ::Constants ':all'; 27 | use JSON::XS; 28 | use Time::HiRes; 29 | use BSD::Resource; 30 | 31 | use IO::File; 32 | use POSIX qw(:sys_wait_h strftime); 33 | use Socket qw(IPPROTO_TCP TCP_NODELAY TCP_KEEPIDLE TCP_KEEPINTVL TCP_KEEPCNT); 34 | use Fcntl qw(F_GETFL F_SETFL O_NONBLOCK); 35 | use Safe; 36 | use Cwd qw(abs_path cwd); 37 | use List::Util qw(min max); 38 | use File::Basename qw(dirname); 39 | 40 | sub pretty_encode; 41 | 42 | our ($REVISION) = (q$LastChangedRevision: 15872 $ =~ /(\d+)/); 43 | 44 | our $VERSION = "0.9.$REVISION"; # 0.10 wil be feature lock, after sql impl. 
0.9.X is zmq & json::xs 45 | 46 | my $STATUS_NEVERRUN=199; 47 | my $STATUS_ORPHAN=-8; 48 | my $STATUS_UNKNOWN=-9; 49 | my $STATUS_EXECERR=-10; 50 | my $PPID=$$; 51 | my $WIN32 = ($^O =~ /Win32/); 52 | my $TIMEFMT = 'command:%C\ncpu-real:%E\ncpu-user:%U\ncpu-sys:%S\nmem-max:%M\nmem-avg:%t\nctx-wait:%w\nfs-in:%I\nfs-out:%O'; 53 | 54 | my ($daemon, $killjob, $editjob); 55 | my (%conf, %def, @metrics, @labels); 56 | 57 | # defaults just run things locally, no master 58 | $def{config} = "/etc/grun.conf"; # config file 59 | $def{spool} = "/var/spool/grun"; # dir to place jobs 60 | $def{port} = 5184; # listen/connect port 61 | $def{bind} = '0.0.0.0'; # listen addr 62 | $def{env} = ['PATH']; # list of environment vars to copy from submit through to exec 63 | $def{default_memory} = 1000 * 1000; # default job memory 64 | $def{default_priority} = 20; # default job priority (20 = always run) 65 | $def{ping_secs} = 30; # how often to tell about load/mem/stats 66 | $def{remove_secs} = '$ping_secs * 1000'; # don't even try kickstarting if the node is this old 67 | $def{idle_load} = .3; # how often to tell about load/mem/stats 68 | $def{retry_secs} = 10; # how often to retry notifications 69 | $def{bench_secs} = 86400; # how often to re-benchmark 70 | $def{max_buf} = 1000000; # how often to retry notifications 71 | $def{expire_secs} = 14400; # remove jobs whose execution nodes haven't reported back in this amount of time 72 | $def{io_keep} = 3600; # keep io for this long after a job with i/o is finished in a detached session 73 | #$def{hard_factor} = 1.5; # hard limit factor 74 | $def{max_sched} = 50; # how many different jobs to try and match before giving up on the rest (queue busy) 75 | $def{spread_pct} = 5; # how often to "distribute jobs", versus "clump" them 76 | $def{master} = 'localhost:5184'; # central scheduler 77 | $def{services} = "queue exec"; # all can run 78 | $def{pid_file} = "/var/run/grun.pid"; # pid file 79 | $def{log_file} = "/var/log/grun.log"; # pid file 80 | $def{hostname} = $ENV{HOSTNAME} ? $ENV{HOSTNAME} : $ENV{COMPUTERNAME} ? $ENV{COMPUTERNAME} : `hostname`; 81 | $def{log_types} = "note error warn"; # log all 82 | $def{nfs_sync} = 1; # enable nfs sync support 83 | 84 | chomp $def{hostname}; 85 | 86 | sub debugging; 87 | 88 | my $GRUN_PATH=abs_path($0); 89 | 90 | my ($qinfo, $help, $config, $ver); 91 | 92 | Getopt::Long::Configure qw(require_order no_ignore_case passthrough); 93 | 94 | my $context = zmq_init(); 95 | 96 | my @ORIG_ARGV= @ARGV; 97 | 98 | GetOptions("daemon"=>\$daemon, "CONF:s"=>\$config, "trace"=>\$def{trace}, "query"=>\$qinfo, "V"=>\$ver, "help"=>\$help) || 99 | die usage(); 100 | 101 | (print "grun $VERSION\n") && exit(0) if $ver; 102 | 103 | my @send_files; # files to send 104 | 105 | my $safe = new Safe; 106 | 107 | $def{config} = $config if $config; 108 | 109 | init(); 110 | 111 | # this fixes issues on some systems, probably a tty thing 112 | system("echo . 
>> /dev/null") 113 | 114 | my $stream_quit = 0; 115 | 116 | if ($ARGV[0] eq '-X') { 117 | do_stream(); 118 | exit(0); 119 | } 120 | 121 | if ($ARGV[0] eq '-Y') { 122 | do_execute(); 123 | exit(0); 124 | } 125 | 126 | 127 | if ($ARGV[0] eq '-?') { 128 | shift @ARGV; 129 | $help = 1; 130 | } 131 | 132 | $help = 1 if defined $config && !$config; 133 | 134 | if ($help) { 135 | print usage(); 136 | exit 0; 137 | } 138 | 139 | if (!$daemon) { 140 | # -k works as long as -d wasn't specified 141 | GetOptions("kill"=>\$killjob, "trace"=>\$def{trace}, "edit|e"=>\$editjob) || 142 | die usage(); 143 | } 144 | 145 | if ($conf{debug_memory}) { 146 | eval {require Devel::Gladiator;}; 147 | die $@ if $@; 148 | } 149 | 150 | my $gjobid = slurp("$conf{spool}/nextid"); 151 | my $log_to_stderr = 0; 152 | 153 | if ($qinfo) { 154 | # this is the code for grun -q 155 | 156 | Getopt::Long::Configure qw(no_require_order no_ignore_case passthrough); 157 | my %opt; 158 | GetOptions(\%opt, "silent", "inplace", "hosts=s", "debug") || die usage(); 159 | my $cmd = shift @ARGV; 160 | die usage() if !$cmd; 161 | $log_to_stderr = 1 if $opt{debug}; 162 | 163 | my @arg = @ARGV; 164 | $cmd =~ s/^-//; 165 | my $tmp = substr bestunique($cmd, qw(config status jobs file history wait memory)), 0, 4; 166 | if (!$tmp) { 167 | die "Command $cmd is not available, for help type grun -query -?\n"; 168 | } 169 | $cmd = $tmp; 170 | 171 | # some commands default to localhost, others default to queue host... this is confusing... fix? 172 | 173 | my @dest = $opt{hosts} ? expandnodes($opt{hosts}) : 174 | $cmd eq 'conf' ? [$conf{bind}, $conf{port}] : 175 | [$conf{master},$conf{master_port}]; 176 | 177 | if ($cmd eq 'file' && @dest > 1) { 178 | die "Command $cmd cannot be run on multiple hosts"; 179 | } 180 | 181 | my $ok=0; 182 | for my $d (@dest) { 183 | my ($host, $port) = @$d; 184 | 185 | if ($cmd eq 'wait') { 186 | my $st = 0; 187 | for (@arg) { 188 | my ($res) = waitmsg($host, $port, "jwait", $_); 189 | if ($res && defined $res->{status}) { 190 | print "Job $_ status $res->{status}\n"; 191 | $st = $res->{status} if $res->{status}; 192 | } else { 193 | print "Job $_ status $STATUS_UNKNOWN\n"; 194 | $st = $STATUS_UNKNOWN; 195 | } 196 | } 197 | exit $st; 198 | } elsif ($cmd eq 'file') { 199 | my $cwd = cwd; 200 | my @need; 201 | for (@arg) { 202 | next if -e $_; 203 | if ($_ !~ /^\//) { 204 | $_ = "$cwd/$_"; 205 | } 206 | push @need, $_; 207 | } 208 | die "not supported yet\n"; 209 | # if (@need) { 210 | # my ($res, $error) = waitio({inplace=>$opt{inplace}}, $host, $port, "xcmd", 'file', @need); 211 | # die $error, "\n" if $error && !$opt{silent}; 212 | # exit 1 if $error; 213 | # } 214 | } elsif ((!samehost($host,$conf{hostname}) || (!$ENV{_GRUN} && (!$conf{services}->{queue} || $cmd !~ /^stat|jobs|hist$/)))) { 215 | # this could get ugly, if called a lot, may want to make more efficient 216 | warn ("waitmsg($host, $port, 'xcmd', $cmd, @arg, @{[%opt]})\n") if $opt{debug}; 217 | my ($ret) = waitmsg($host, $port, "xcmd", $cmd, @arg, %opt); 218 | print $ret; 219 | $ok=1 if $ret; 220 | } else { 221 | warn ("Using local queue status, host $host is $conf{bind}/$conf{hostip}, name is $conf{hostname} \n") if $opt{debug}; 222 | my $ret; 223 | if ($cmd eq 'stat') { 224 | $ret = shownodes(@arg); 225 | } elsif ($cmd eq 'jobs') { 226 | $ret = showjobs(@arg); 227 | } elsif ($cmd eq 'hist') { 228 | $ret = showhist(@arg); 229 | } 230 | print $ret; 231 | $ok=1 if $ret; 232 | } 233 | } 234 | exit($ok ? 
0 : 1); 235 | } 236 | 237 | my $gpid; # daemon pid 238 | if (open(IN, $conf{pid_file})) { 239 | $gpid = ; 240 | close IN; 241 | } 242 | 243 | if ($killjob) { 244 | # grun -k code 245 | my $sig = 15; 246 | my $kforce = 0; 247 | Getopt::Long::Configure qw(no_require_order no_ignore_case); 248 | GetOptions("signal|n=i"=>\$sig, "force"=>\$kforce) || die usage(); 249 | 250 | my $exit = 0; 251 | for my $job (@ARGV) { 252 | my @id; 253 | if ($job !~ /^\d/) { 254 | @id=(guid=>"$job"); 255 | } else { 256 | @id=(jid=>$job); 257 | } 258 | my $err = kill_job(@id, sig=>$sig, force=>$kforce); 259 | if (!defined($err) && $@) { 260 | warn $@,"\n"; 261 | $exit=-1; 262 | } else { 263 | my $ok = ($err =~ /^Job.*(aborted|kill requested)/); 264 | $err =~ s/\n$//; 265 | warn "$err\n" if $ok; 266 | $err = 'No remote response to jkill' if !$ok && !$err; 267 | warn "Error: $err\n" if !$ok; 268 | $exit=-1 if !$ok; 269 | } 270 | } 271 | exit 0; 272 | } 273 | 274 | if ($editjob) { 275 | my %ed; 276 | while (@ARGV) { 277 | $_=$ARGV[0]; 278 | for (split /,/, $_) { 279 | my ($key, $val) = $_ =~ /^([^=]+)(?:=(.*))?$/; 280 | my $nk = bestunique($key, qw(hold resume memory cpus state hosts), @metrics, @labels); 281 | $key = $nk if $nk; 282 | $key = 'state', $val = 'hold' if ($key eq 'hold'); 283 | $key = 'state', $val = 'resume' if ($key eq 'resume'); 284 | if ($key eq 'state') { 285 | $val = substr bestunique($val, qw(hold resume)), 0, 4; 286 | die "' must be one of: h(old) r(esume)\n" unless $val; 287 | } 288 | $ed{$key}=$val; 289 | } 290 | shift; 291 | last unless $ARGV[0] =~ /=/; 292 | } 293 | my @jids = @ARGV; 294 | 295 | die usage() if !%ed || !@jids; 296 | 297 | my $ex = 0; 298 | for my $jid (@jids) { 299 | warn "Edit " . packdump(\%ed) . "\n"; 300 | my ($err) = waitmsg($conf{master}, $conf{master_port}, 'jedit', $jid, %ed); 301 | my $ok = ($err =~ /^Job.*edited/); 302 | $err=~ s/\n$//; 303 | warn "$err\n" if $ok; 304 | $err = 'No remote response to jedit' if !$ok && !$err; 305 | warn "Error: $err\n" if !$ok; 306 | $ex = 1 if !$ok; 307 | } 308 | exit $ex; 309 | } 310 | 311 | 312 | 313 | 314 | my ($router, $read_set, $write_set, $quit, %pid_jobs, %j_wait, %io_wait, %start_wait); # daemon globals 315 | my %ZMQS; # hash of open sockets 316 | my %nodes; # hash of registered nodes 317 | 318 | if ($daemon) { 319 | startdaemon(); 320 | } else { 321 | grun_client(); 322 | } 323 | 324 | #################### 325 | # client mode 326 | 327 | my $make; 328 | my %sync_after; 329 | my %sync_before; 330 | my %sync_already; 331 | 332 | sub grun_client { 333 | my %jobs; 334 | my %opt; 335 | 336 | $opt{wait} = 1; # keep socket open until job is finished 337 | $opt{io} = 1; # copy io back on the socket, implies wait 338 | 339 | Getopt::Long::Configure qw(require_order no_ignore_case passthrough); 340 | 341 | GetOptions(\%opt, "file=s", "int|I", "memory|m=i", "hosts|h=s", "cpus|c=f", "io!", "wait!", "err_a|err-append|ea=s", "err|e=s", "out|o=s", "out_a|out-append|oa=s", "ouer|out-err|oe=s", "ouer_a|out-err-append|oea=s", "jobx|jobid|j=s", "verbose", "make|M", "debug|D", "env|E=s", "alert=s", "param|p=s@", "wait-exists|W", "priority|r=i"); 342 | 343 | if ((!!$opt{out} + !!$opt{out_a} + !!$opt{ouer} + !!$opt{ouer_a})>1) { 344 | $config=undef; 345 | die "ERROR: Specify only one of --out, --out-append, --out-err or --out-err-append\n\n" . 
usage() 346 | } 347 | 348 | if ((!!$opt{err} + !!$opt{err_a} + !!$opt{ouer} + !!$opt{ouer_a})>1) { 349 | $config=undef; 350 | die "ERROR: Specify only one of --err, --err-append, --out-err or --out-err-append\n\n" . usage() 351 | } 352 | 353 | if (my $t=$opt{out_a}?$opt{out_a}:$opt{ouer_a}) { 354 | $opt{out}=$t; 355 | $opt{out_a}=1; 356 | delete $opt{ouer_a}; 357 | } 358 | if (my $t=$opt{err_a}?$opt{err_a}:$opt{ouer_a}) { 359 | $opt{err}=$t; 360 | $opt{err_a}=1; 361 | delete $opt{ouer_a}; 362 | } 363 | if ($opt{ouer}) { 364 | $opt{out}=$opt{err}=$opt{ouer}; delete $opt{ouer}; 365 | } 366 | 367 | my $verbose = $opt{verbose}; delete $opt{verbose}; 368 | my $env = $opt{env}; delete $opt{env}; 369 | $make = $opt{make}; delete $opt{make}; 370 | $log_to_stderr = 1 if $opt{debug}; 371 | 372 | if ($< == 0) { 373 | die "Won't run a job as root\n"; 374 | } 375 | 376 | if ($opt{err} && $opt{out}) { 377 | $opt{io} = 0; 378 | } 379 | 380 | while ($ARGV[0] =~ /^--([\w-]+)=([\w=]+)/) { 381 | # allow arbitrary job options, that jan later be referred to in match expressions 382 | # or in execution wrappers, etc 383 | $opt{$1} = $2; 384 | shift @ARGV; 385 | } 386 | 387 | if ($make || $verbose) { 388 | GetOptions(\%opt, "noexec"); 389 | } 390 | 391 | my @cmd = @ARGV; 392 | 393 | if ($opt{file}) { 394 | # read options from a file 395 | funpack($opt{file}, \%opt); 396 | if ($opt{cmd}) { 397 | if (@ARGV) { 398 | die "Can't supply cmd: in the file and '@cmd' on the command line\n"; 399 | } 400 | if ($opt{cmd} !~ /^[\w0-9:\/\t -]+$/) { 401 | # not simple: let bash handle it 402 | @cmd = ('bash', '-c', $opt{cmd}); 403 | } else { 404 | # simple: split, pass as is to exec 405 | $opt{cmd} =~ s/^\s+//; 406 | $opt{cmd} =~ s/\s+$//; 407 | @cmd = split /\s+/, $opt{cmd}; 408 | } 409 | } 410 | } 411 | 412 | # force exec in "same as current dir" 413 | $opt{cwd} = cwd; 414 | if (!$opt{cwd}) { 415 | die "Can't get current working directory, not executing unanchored remote command.\n"; 416 | } 417 | 418 | if ($make) { 419 | # %i:input %o:output 420 | my (@i, @o); 421 | for (@cmd) { 422 | my @t = m/[#%]([io]):(\S+)/g; 423 | if (@t) { 424 | for (my $i=0;$i<@t;++$i) { 425 | push @o, $t[$i+1] if $t[$i] eq 'o'; 426 | push @i, $t[$i+1] if $t[$i] eq 'i'; 427 | } 428 | s/%[io]:(\S+)/$1/g; 429 | s/#[io]:\S+//g; 430 | } 431 | my @t = m/([<>])\s*(\S+)/g; 432 | if (@t) { 433 | for (my $i=0;$i<@t;$i+=2) { 434 | push @i, $t[$i+1] if $t[$i] eq '<' && $t[$i+1]=~/^\s*\w/; 435 | push @o, $t[$i+1] if $t[$i] eq '>' && $t[$i+1]=~/^\s*\w/; 436 | } 437 | } 438 | s/\%\!\>(>?\s*\S+)/>$1/g; 439 | } 440 | die "Unable to determine i/o for -M (make) semantics\n\n" . usage() 441 | if !@i || !@o; 442 | 443 | my $need=0; 444 | for my $i (@i) { 445 | syncfile($i); 446 | add_syncfile_before($i); 447 | for my $o (@o) { 448 | syncfile($o); 449 | if (! 
(-s $o) || (fmodtime($i) > fmodtime($o))) { 450 | warn "# need $o\n" if $opt{noexec} || $verbose; 451 | $need=1; 452 | } 453 | } 454 | } 455 | if (!$need) { 456 | warn "Skipping: @cmd\n"; 457 | exit 0; 458 | } 459 | for (@o) { 460 | add_syncfile_after($_); 461 | } 462 | warn "+@cmd\n" if $opt{noexec} || $verbose; 463 | } else { 464 | for (@cmd) { 465 | if (m{([^\s,:]+)/([^\s,:]+)}) { 466 | add_syncfile_after($_); 467 | add_syncfile_before($_); 468 | } 469 | } 470 | add_syncdir_after($opt{cwd}); 471 | add_syncdir_before($opt{cwd}); 472 | } 473 | 474 | if ($ARGV[0] =~ /^-/) { 475 | die "Unknown option $ARGV[0]\n"; 476 | } 477 | 478 | 479 | #die pretty_encode \@cmd if $opt{debug}; 480 | 481 | if (!@cmd) { 482 | die usage(); 483 | } 484 | 485 | if ($cmd[$#cmd] =~ /\&$/) { 486 | # TODO: grun should wait for all kids, and disallow detaching, not just get rude here 487 | die "Not running background-only job. You might mean: grun \"command\" &.\n"; 488 | } 489 | 490 | if ($conf{auto_profile}) { 491 | if (-e ($conf{auto_profile})) { 492 | my $cmd = join ' ', @cmd; 493 | # safe'ish eval, just so there aren't weird side effects 494 | my ($cpu, $mem, $prof) = evalctx(slurp($conf{auto_profile}) . ";\nreturn (\$cpu, \$mem, \%prof);", cmd=>$cmd, cmd=>\@cmd); 495 | 496 | $prof = $cpu if ref($cpu) eq 'HASH'; 497 | 498 | # alias names 499 | $prof->{cpus}=$prof->{cpu} if !$prof->{cpus} && $prof->{cpu}; 500 | $prof->{memory}=$prof->{mem} if !$prof->{memory} && $prof->{mem}; 501 | 502 | if ($prof && ref($prof) eq 'HASH') { 503 | $prof->{memory} = $mem if defined($mem) && !ref($mem) && !$prof->{memory}; 504 | $prof->{cpus} = $cpu if defined($cpu) && !ref($cpu) && !$prof->{cpus}; 505 | 506 | $opt{memory}=$prof->{memory} if $prof->{memory} && !$opt{memory}; 507 | $opt{cpus}=$prof->{cpus} if $prof->{cpus} && !$opt{cpus}; 508 | $opt{hosts}=$prof->{hosts} if $prof->{hosts} && !$opt{hosts}; 509 | $opt{priority}=$prof->{priority} if $prof->{priority} && !$opt{priority}; 510 | 511 | for ((@metrics,@labels)) { 512 | # as if the user entered it 513 | next if !$_; 514 | push @{$opt{"param"}}, "$_=" . $prof->{$_} if defined($prof->{$_}); 515 | } 516 | } else { 517 | $opt{memory}=$mem if ($mem > $opt{memory}); 518 | $opt{cpus}=$cpu if ($cpu > $opt{cpus}); 519 | } 520 | 521 | if ($@) { 522 | die "Can't run $conf{auto_profile}: $@\n"; 523 | } 524 | } else { 525 | die "Can't find $conf{auto_profile}: $!\n"; 526 | } 527 | } 528 | 529 | my %param; 530 | if ($opt{param}) { 531 | for (@{$opt{param}}) { 532 | if (/^([^=]+)=(.*)/) { 533 | $param{$1} = $2; 534 | } else { 535 | die "Parameter $_: should be name=value\n"; 536 | } 537 | } 538 | } 539 | $opt{param} = \%param; 540 | 541 | $opt{priority} = $ENV{GRUN_PRIORITY}+0 if !$opt{priority} && $ENV{GRUN_PRIORITY} >= 1; 542 | $opt{priority} = $conf{default_priority} if !$opt{priority}; 543 | 544 | # convert memory to kB 545 | if ($opt{memory}) { 546 | if ($opt{memory} =~ /kb?$/i) { 547 | # internal memory unit is kB 548 | } elsif ($opt{memory} =~ /gb?$/i) { 549 | # convert gB to kB 550 | $opt{memory} *= 1000000; 551 | } else { 552 | # convert mB to kB 553 | $opt{memory} *= 1000; 554 | } 555 | } else { 556 | $opt{memory} = $conf{default_memory}; 557 | } 558 | 559 | # no socket io unless waiting 560 | if (!$opt{wait}) { 561 | delete $opt{io}; 562 | } 563 | 564 | 565 | # copy env 566 | if ($env eq '*' || ($conf{env} && ($conf{env}->[0] eq '*')) ) { 567 | for (keys %ENV) { 568 | if (! 
/^_|LS_COLORS/) { 569 | $opt{env}->{$_} = $ENV{$_}; 570 | } 571 | } 572 | } else { 573 | for (@{$conf{env}}) { 574 | $opt{env}->{$_} = $ENV{$_} if defined $ENV{$_}; 575 | } 576 | for (split /\s+/, $env) { 577 | $opt{env}->{$_} = $ENV{$_} if defined $ENV{$_}; 578 | } 579 | } 580 | 581 | $opt{user} = getpwuid($>); 582 | $opt{group}=$); 583 | $opt{umask} = umask(); 584 | $opt{env}->{USER} = $opt{user}; 585 | 586 | if (!$opt{wait}) { 587 | open STDERR, ">&STDOUT"; 588 | } 589 | 590 | $opt{memory} = $opt{memory} ? $opt{memory} : $conf{default_memory}; 591 | $opt{cpus} = $opt{cpus} ? $opt{cpus} : 1; 592 | 593 | for (@metrics) { 594 | if ($conf{"default_$_"}) { 595 | $opt{$_} = $opt{$_} ? $opt{$_} : $conf{"default_$_"}; 596 | } 597 | } 598 | 599 | if ($verbose) { 600 | printf STDERR "Memory: %d\n", $opt{memory}; 601 | printf STDERR "CPUs: %d\n", $opt{cpus}; 602 | printf STDERR "Hosts: %s\n", $opt{hosts} if $opt{hosts}; 603 | for ((@metrics,@labels)) { 604 | printf STDERR proper($_) . ": %s\n", $opt{param}->{$_} ? $opt{param}->{$_} : 1; 605 | } 606 | } 607 | 608 | if ($opt{jobx}) { 609 | if ($opt{jobx} =~ /^\d/) { 610 | die "External job id's should start with a non-numeric\n"; 611 | } 612 | } 613 | 614 | my %info; 615 | 616 | sub client_sigh { 617 | my $signame = shift; 618 | $SIG{INT} = undef; 619 | $SIG{TERM} = undef; 620 | $SIG{PIPE} = undef; 621 | if ($info{jid}||$info{guid}) { 622 | if (!($signame eq 'PIPE')) { 623 | print STDERR "Aborting command, sending jkill for $info{jid}\n"; 624 | } 625 | my $code = $signame eq 'INT' ? 2 : $signame eq 'PIPE' ? 13 : 15; 626 | my $err = kill_job(jid=>$info{jid}, guid=>$opt{guid}, sig=>$code, termio=>1); 627 | if (!($signame eq 'PIPE')) { 628 | if (!defined($err) && $@) { 629 | warn $@,"\n"; 630 | } 631 | } 632 | exit 128+$code; 633 | } else { 634 | die "Interrupted before job sent\n"; 635 | } 636 | }; 637 | 638 | # for testing -M make 639 | exit 0 if $opt{noexec}; 640 | 641 | $SIG{INT} = \&client_sigh; 642 | $SIG{TERM} = \&client_sigh; 643 | $SIG{PIPE} = \&client_sigh; 644 | 645 | $opt{cmd} = \@cmd; 646 | $opt{hard_factor} = $conf{hard_factor}; 647 | $opt{frompid} = $$; 648 | $opt{guid} = $opt{jobx} ? $opt{jobx} : create_guid(); 649 | $opt{syncdirs} = [keys(%sync_before)]; 650 | %info = waitmsg($conf{master}, $conf{master_port}, 'run', \%opt); 651 | 652 | if (!%info) { 653 | die ($@ ? $@ : "No response to 'run'") . 
"\n"; 654 | } 655 | 656 | if ($info{error}) { 657 | print STDERR $info{error}, "\n"; 658 | exit -1; 659 | } 660 | 661 | if (!$info{jid}) { 662 | print STDERR "Failed to submit job", "\n"; 663 | exit -1; 664 | } 665 | my $save_jid = $info{jid}; 666 | 667 | $0 = "GRUN:$save_jid"; 668 | 669 | if ($verbose) { 670 | printf STDERR "Job_ID: $info{jid}\n"; 671 | } 672 | 673 | if ($info{already}) { 674 | if ($opt{"wait-exists"}) { 675 | my ($res) = waitmsg($conf{master},$conf{master_port}, "jwait", $info{jid}); 676 | my $st; 677 | if ($res && defined $res->{status}) { 678 | print "Job $_ status $res->{status}\n"; 679 | $st = $res->{status} if $res->{status}; 680 | } else { 681 | print "Job $_ status $STATUS_UNKNOWN\n"; 682 | $st = $STATUS_UNKNOWN; 683 | } 684 | exit $st; 685 | } else { 686 | print STDOUT "Job_ID: $info{jid} \n" if !$verbose; 687 | printf STDERR "Already running job named $opt{jobx}, try grun -q wait JOB\n"; 688 | exit -1; 689 | } 690 | } 691 | 692 | if ($opt{wait}) { 693 | # wait for a job ip 694 | while (!defined($info{ip})) { 695 | my %tmp = waitmsg_retry($conf{retry_secs}*1000, $conf{master}, $conf{master_port}, 'jinfo', $info{jid}); 696 | 697 | if ($tmp{error}) { 698 | print STDERR $tmp{error}, "\n"; 699 | exit $tmp{status} != 0 ? $tmp{status} : -1; 700 | } 701 | 702 | if (!defined($tmp{ip})) { 703 | xlog("error","Had to retry after a retry... BUG in job $save_jid\n"); 704 | sleep(5); 705 | } else { 706 | %info=%tmp; 707 | } 708 | } 709 | 710 | my $host = $info{hostname} ? $info{hostname} : $info{ip}; # support old ver which didn't have hostname 711 | 712 | if ($verbose) { 713 | printf STDERR "Host: %s\n", $host; 714 | } 715 | 716 | # look at all the work you can avoid if you don't wait 717 | my ($stat, $err, $diderr); 718 | # connect to executing node directly and ask for stderr, stdout, and status based on job id 719 | if (defined $info{status}) { 720 | $stat=$info{status}; 721 | $err=$info{error}; 722 | } 723 | while (!defined $stat && !defined $err) { 724 | # shadow watcher.... 725 | if ($info{ip}=~/^\d/) { 726 | if ($opt{io} && !($opt{err} && $opt{out}) ) { 727 | ($stat, $err, $diderr) = waitio($info{ip}, $info{port}, $info{jid}, \%opt); 728 | } else { 729 | my ($key, $dat) = waitmsg_retry($conf{retry_secs}*1000, $info{ip}, $info{port}, 'xstat', $info{jid}); 730 | $stat=$dat->{status}; 731 | $err=$dat->{error}; 732 | } 733 | if ($stat == 65280 && !$err) { 734 | print STDERR "Error: [$host] Command returned -1\n"; 735 | } 736 | } else { 737 | $stat=$STATUS_UNKNOWN; 738 | $err="Error in grun protocol (ip=$info{ip}), unknown status!\n"; 739 | } 740 | sleep 5 if (!defined $stat && !defined $err); 741 | } 742 | 743 | # send this command into the ether...if exec doesn't get it, clean up after timeout 744 | if ($opt{io}||$opt{wait}) { 745 | if ($info{ip}=~/^\d/) { 746 | sendcmd($info{ip}, $info{port}, 'xclean', $info{jid}); 747 | } 748 | } 749 | if ($stat == 11) { 750 | $err = 'Segmentation fault'; 751 | } 752 | if ($stat > 127) { 753 | # shift... but if that doesn't get you anything, set to -1 (unknown error) 754 | $stat = $stat>>8; 755 | $stat = -1 if !$stat; 756 | } 757 | syncafter(); 758 | if ($err) { 759 | print STDERR "[$host] $err", "\n"; 760 | $stat = -1 if !$stat; # unknown error if undefined 761 | } else { 762 | if ($stat != 0 && ! $diderr) { 763 | print STDERR "Error: [$host] Command returned $stat\n"; 764 | } 765 | } 766 | unlink("$ENV{HOME}/.grun/jobx/$opt{jobx}") if $opt{jobx}; 767 | 768 | exit($stat); 769 | } else { 770 | # aah... 
nicer 771 | print STDOUT "Job_ID: $info{jid} \n" if !$verbose; 772 | } 773 | } 774 | 775 | ### nfs sync support 776 | 777 | sub syncdirs { 778 | for (@_) { 779 | syncdir($_); 780 | } 781 | } 782 | 783 | sub syncafter { 784 | %sync_already = (); 785 | return unless $conf{nfs_sync}; 786 | syncdirs(keys(%sync_after)); 787 | } 788 | 789 | sub add_syncdir_after { 790 | my ($d) = @_; 791 | return unless $conf{nfs_sync}; 792 | $sync_after{abs_path($d)}=1; 793 | } 794 | 795 | sub add_syncdir_before { 796 | my ($d) = @_; 797 | return unless $conf{nfs_sync}; 798 | $sync_before{abs_path($d)}=1; 799 | } 800 | 801 | sub add_syncfile_before { 802 | my ($f) = @_; 803 | return unless $conf{nfs_sync}; 804 | add_syncdir_before(-d $f ? $f : dirname($f)); 805 | } 806 | 807 | sub add_syncfile_after { 808 | my ($f) = @_; 809 | return unless $conf{nfs_sync}; 810 | add_syncdir_after(-d $f ? $f : dirname($f)); 811 | } 812 | 813 | sub syncfile { 814 | my ($f) = @_; 815 | return unless $conf{nfs_sync}; 816 | syncdir(-d $f ? $f : dirname($f)); 817 | } 818 | 819 | # this refreshes the lookupcasche, which is an issue when running scripts back to back on multiple nodes using NFS 820 | sub syncdir { 821 | my ($d) = @_; 822 | my $tmp; 823 | return if $sync_already{$d}; 824 | opendir($tmp,$d); 825 | closedir($tmp); 826 | } 827 | 828 | sub readsocks { 829 | my $did; 830 | # identity & data received 831 | my $cnt; 832 | 833 | if (!$router) { 834 | xlog("debug", "Readsocks called with no router: " . Carp::longmess() ); 835 | return; 836 | } 837 | 838 | while (my $id= zmq_recvmsg($router,ZMQ_NOBLOCK)) { 839 | ++$cnt; 840 | $did=1; 841 | $id = zmq_msg_data($id); 842 | 843 | if (zmq_getsockopt($router,ZMQ_RCVMORE)) { 844 | # print "getting sep\n"; 845 | my $sep = zmq_recvmsg($router); 846 | } 847 | if (zmq_getsockopt($router,ZMQ_RCVMORE)) { 848 | # print "getting data\n"; 849 | my $msg = zmq_recvmsg($router); 850 | my $data = zmq_msg_data($msg); 851 | my @resp = process_message($data, $id); 852 | if (@resp) { 853 | print("Got resp: @resp\n"); 854 | replymsg($id, @resp); 855 | } else { 856 | # warn("No resp for $data\n"); 857 | } 858 | } 859 | while (zmq_getsockopt($router,ZMQ_RCVMORE)) { 860 | print("Discarding excess multipart data!\n"); 861 | } 862 | if ($cnt > 100000) { 863 | xlog("error", "Getting flooded with messages"); 864 | last; 865 | } 866 | } 867 | return $did; 868 | } 869 | 870 | sub jhiststat { 871 | my ($jid) = @_; 872 | my $jhistfile=jhistpath($jid); 873 | if (-e $jhistfile) { 874 | # job is toast... but maybe some streams are waiting 875 | my $job=unpack_file($jhistfile); 876 | delete $job->{env}; 877 | $job->{host}="n/a" if !$job->{host} && !defined $job->{status}; 878 | $job->{ip}=host2ip($job->{host}) if !$job->{ip} && ! defined $job->{status}; 879 | # needs ip! 880 | $job->{ip}="n/a" if defined $job->{status} && ! $job->{ip}; 881 | $job->{hostname}=$job->{host}; 882 | return $job; 883 | } 884 | return undef; 885 | } 886 | 887 | sub replymsg { 888 | my ($id, @resp) = @_; 889 | if (debugging) { 890 | my $hid=unpack("h*",$id); 891 | xlog("debug", "Reply ($hid) " . packdump(\@resp) . 
"\n") if $conf{trace}; 892 | } 893 | zmq_send($router, $id, length($id), ZMQ_SNDMORE); 894 | zmq_send($router, "", 0, ZMQ_SNDMORE); 895 | zmq_send($router, packref(\@resp)); 896 | } 897 | 898 | sub selfcmd { 899 | my $cmd = packcmd(@_); 900 | process_message($cmd, undef); 901 | } 902 | 903 | my $debugzid; 904 | sub process_message { 905 | my ($src_data, $zid) = @_; 906 | 907 | my ($ip, $trace, $cmd, @args) = unpackcmd($src_data); 908 | 909 | $trace=$conf{trace} if !$trace; 910 | 911 | if (debugging|$trace) { 912 | my $hid=defined($zid) ? unpack("h*",$zid) : ""; 913 | xlog($trace ? "trace" : "debug", "Received command ($hid) '$cmd' : $src_data\n") if $trace; 914 | } 915 | 916 | return ('error'=>$@) if ($@); 917 | 918 | if ($cmd eq 'xcmd') { 919 | # these commands 'query or interfere' with normal running of the server 920 | # they are initiated by a user 921 | # they are limped together like this for basically no reason 922 | 923 | if ($args[0] eq 'relo') { 924 | # reread config... maybe rebind stuff too 925 | xlog("note", "Reload from remote command (ARGS: @ORIG_ARGV)"); 926 | eval{init();}; 927 | if ($@) { 928 | return "Error: $conf{config}, $@"; 929 | } else { 930 | return "Ok, reloaded from $conf{config}"; 931 | } 932 | } elsif ($args[0] eq 'term') { 933 | xlog("note", "Shutdown from remote command"); 934 | $quit = 1; 935 | return 'Ok, shutdown initiated'; 936 | } elsif ($args[0] eq 'rest') { 937 | xlog("note", "Restarting from remote command ($GRUN_PATH @ORIG_ARGV)"); 938 | $quit = 1; 939 | zmq_unbind($router, "tcp://$conf{bind}:$conf{port}"); 940 | if (!fork) { 941 | zmq_fork_undef(); 942 | exec($GRUN_PATH, @ORIG_ARGV); 943 | } 944 | return "Ok, restart initiated"; 945 | } elsif ($args[0] eq 'stat') { 946 | shift @args; 947 | return shownodes(@args); 948 | } elsif ($args[0] eq 'hist') { 949 | shift @args; 950 | # if (@args && (@args > 1 || $args[0] !~ /^\d+$/)) { 951 | # fork... do it in parallel 952 | # forkandgo($zid, \&showhist, @args); 953 | # return(); 954 | # } else { 955 | # inline... do it now, less expensive than fork! 
956 | return showhist(@args); 957 | # } 958 | } elsif ($args[0] eq 'conf') { 959 | return showconf(); 960 | } elsif ($args[0] eq 'memo') { 961 | return showmem(); 962 | } elsif ($args[0] eq 'jobs') { 963 | shift @args; 964 | return showjobs(@args); 965 | # } elsif ($args[0] eq 'file') { 966 | # shift @args; 967 | # xlog("note", "Sending file [@args] to remote"); 968 | # return ($SOCK_FILE, @args); 969 | } else { 970 | return "Error: unknown xcmd '$args[0]'"; 971 | } 972 | } elsif ($cmd eq 'frep') { 973 | # warn("router is $router, zid is $debugzid\n"); 974 | my ($dat) = @args; 975 | if ($dat->{zid}) { 976 | xlog("debug", "Sending response to $dat->{zid}"); 977 | replymsg(pack("h*",$dat->{zid}),$dat->{out},$dat->{more}); 978 | #replymsg($debugzid,$dat->{out}); 979 | } 980 | } elsif ($cmd eq 'node') { 981 | # this is the 'node ping' 982 | if (ref($args[0]) eq 'HASH') { # bit of validation 983 | my $node = $args[0]; 984 | 985 | $node->{ip}=$ip unless $node->{ip}; 986 | $ip=$node->{ip}; 987 | 988 | if ($ip) { 989 | my $file = "$conf{spool}/nodes/$ip.reg"; 990 | open(F, ">$file") || return("Error: can't create $file : $!"); 991 | print F packfile($node); 992 | close F; 993 | # also stick in memory 994 | 995 | if (!$nodes{$ip}) { 996 | xlog("note", "Registering node $ip:$node->{port} $node->{hostname}"); 997 | } 998 | 999 | $node->{ping} = time(); 1000 | $node->{ex_ping} = $nodes{$ip}->{ping}; 1001 | $node->{zid} = $zid; 1002 | 1003 | $nodes{$ip} = $node; 1004 | } else { 1005 | xlog("note", "Can't register node with no ip"); 1006 | } 1007 | # save execution node ping time for diagnostic (times out of sync or very slow transmission, etc) 1008 | } else { 1009 | return "Error: invalid node registration info"; 1010 | } 1011 | return (); 1012 | } elsif ($cmd eq 'xedit') { 1013 | my ($jid, $ed) = @args; 1014 | if ($ed->{state} =~ /hold|resu/) { 1015 | my $job = unpack_file("$conf{spool}/jpids/$jid"); 1016 | 1017 | if ($job && $job->{pid}) { 1018 | my ($sig, $stat); 1019 | if ($ed->{state} eq 'hold') { 1020 | $sig=-19; $stat="susp"; 1021 | } else { 1022 | $sig=-18; $stat=""; 1023 | } 1024 | kill($sig, $job->{pid}); 1025 | sendcmd($conf{master},$conf{master_port}, 'jedit', $job->{id}, state=>$stat); 1026 | } 1027 | } 1028 | } elsif ($cmd eq 'jedit') { 1029 | my ($jid, %ed) = @args; 1030 | my $fname; 1031 | if (!$jid || !%ed) { 1032 | xlog("error", "Invalid edit request (@args)"); 1033 | return "Invalid edit request (@args)\n"; 1034 | } elsif (! -e "$conf{spool}/jobs/$jid") { 1035 | if (! -e "$conf{spool}/jobs/$jid.ip") { 1036 | xlog("error", "Job $jid not found during jedit"); 1037 | return "Job $jid not found during jedit"; 1038 | } else { 1039 | my $job_ip = slurp("$conf{spool}/jobs/$jid.ip"); 1040 | # send an 'xedit' to the running node 1041 | # needed only for certain edits....think about this! 1042 | $fname = "$conf{spool}/jobs/$jid:$job_ip.run"; 1043 | if ($ed{state} =~ /hold|resu/) { 1044 | if ($nodes{$job_ip}) { 1045 | sendcmd_nowait($job_ip, $nodes{$job_ip}->{port}, 'xedit', $jid, \%ed); 1046 | } 1047 | } 1048 | # for now we allow edit-in-place.... 
for no effect in some cases, but not others 1049 | } 1050 | } else { 1051 | $fname = "$conf{spool}/jobs/$jid"; 1052 | } 1053 | 1054 | # assert($fname) 1055 | my $ref = unpack_file($fname); 1056 | for my $key (keys(%ed)) { 1057 | $ref->{$key} = $ed{$key}; 1058 | } 1059 | burp("$conf{spool}/jobs/$jid.jedit", packfile($ref)); 1060 | rename("$conf{spool}/jobs/$jid.jedit", $fname); 1061 | return "Job $jid edited\n"; 1062 | } elsif ($cmd eq 'jwait') { 1063 | my ($job) = @args; 1064 | 1065 | # user can specify guid or jid 1066 | my $jid = jid_from_opts($job=~/^\d/?{jid=>$job}:{guid=>$job}); 1067 | if (!$jid) { 1068 | return {error=>"Job $job not found"}; 1069 | } 1070 | if ( -e "$conf{spool}/jobs/$jid" || -e "$conf{spool}/jobs/$jid.ip" ) { 1071 | $j_wait{$jid}->{$zid}=time(); 1072 | return (); 1073 | } else { 1074 | my $jhistfile=jhistpath($jid); 1075 | if (-e $jhistfile) { 1076 | # repeated acks are ok 1077 | my $job=unpack_file($jhistfile); 1078 | if ($job) { 1079 | return $job; 1080 | } else { 1081 | return {error=>"Invalid job history $jhistfile"}; 1082 | } 1083 | } else { 1084 | return {error=>"Job not running, and has no history"}; 1085 | } 1086 | } 1087 | } elsif ($cmd eq 'jkill') { 1088 | # this is the 'job kill command' 1089 | my ($job) = @args; 1090 | 1091 | # user can specify guid or jid 1092 | my $jid = jid_from_opts($job); 1093 | if (!$jid) { 1094 | if ($job->{guid}) { 1095 | return "Job $job->{guid} not found\n"; 1096 | } else { 1097 | return "No job specified for jkill\n"; 1098 | } 1099 | } 1100 | if (! -e "$conf{spool}/jobs/$jid") { 1101 | if (! -e "$conf{spool}/jobs/$jid.ip") { 1102 | xlog("error", "Job $jid not running or queued, but kill requested"); 1103 | return "Job $jid not running or queued"; 1104 | } else { 1105 | # send xabort to correct node 1106 | my $job_ip = slurp("$conf{spool}/jobs/$jid.ip"); 1107 | if (!$job_ip) { 1108 | xlog("error", "Job $jid empty ip!"); 1109 | my $job = unpack_file("$conf{spool}/jobs/$jid"); 1110 | $job->{error}="Unknown state"; 1111 | archive_job($jid, $job, $STATUS_UNKNOWN, undef, undef); 1112 | return "Job $jid, unknown state"; 1113 | } else { 1114 | if ($job->{force}) { 1115 | xlog("error", "Job $jid forced kill"); 1116 | my $job = unpack_file("$conf{spool}/jobs/$jid:$job_ip.run"); 1117 | if ($nodes{$job_ip}) { 1118 | sendcmd_nowait($job_ip, $nodes{$job_ip}->{port}, 'xabort', $jid, 9); 1119 | } 1120 | $job->{error}="Forced kill"; 1121 | archive_job($jid, $job, $STATUS_UNKNOWN, undef, undef); 1122 | return "Job $jid forced kill\n"; 1123 | } else { 1124 | if ($nodes{$job_ip}) { 1125 | return "Forward $jid $job_ip:" . $nodes{$job_ip}->{port}; 1126 | } else { 1127 | return "Job $jid, node $job_ip not online, can't kill\n"; 1128 | } 1129 | } 1130 | } 1131 | } 1132 | } else { 1133 | my $job = unpack_file("$conf{spool}/jobs/$jid"); 1134 | $job->{error}="Killed"; 1135 | archive_job($jid, $job, $STATUS_NEVERRUN, undef, undef); 1136 | return "Job $jid aborted"; 1137 | } 1138 | } elsif ($cmd eq 'jstat') { 1139 | # sent by the execution node to say what the status of a job is 1140 | my %opts = %{$args[0]}; 1141 | if (my $jid = $opts{id}) { 1142 | my $should = slurp("$conf{spool}/jobs/$opts{id}.ip"); 1143 | 1144 | if (! 
-e "$conf{spool}/jobs/$opts{id}:$ip.run") { 1145 | if (!$should) { 1146 | # this could be a repeat, and that's ok 1147 | xlog("debug", "Probably already got 'jstat' for $jid"); 1148 | } else { 1149 | if ($opts{pid}<0) { 1150 | xlog("error", "Orphaned job report $jid, from $ip for $should, status: $opts{status}"); 1151 | $ip=$should if $opts{status} == $STATUS_ORPHAN; 1152 | } else { 1153 | if (!($ip eq $should)) { 1154 | xlog("error", "Got a report ($opts{status},$opts{jrun}) for job $jid from $ip, should be $should"); 1155 | } else { 1156 | xlog("error", "Got a report ($opts{status},$opts{jrun}) for job $jid, but there's no $jid:$ip.run file"); 1157 | } 1158 | } 1159 | } 1160 | } 1161 | if ($opts{jrun}) { 1162 | # just a ping 1163 | xlog("trace", "Still alive $jid from $ip") if $trace; 1164 | touch("$conf{spool}/jobs/$jid:$ip.run") 1165 | if -e "$conf{spool}/jobs/$jid:$ip.run"; 1166 | if ( ! -e "$conf{spool}/jobs/$jid.ok" ) { 1167 | # jexok didn't come through, but should have 1168 | xlog("error", "Writing $jid.ok, from jrun signal, may need to restart exec node"); 1169 | burp("$conf{spool}/jobs/$jid.ok","jrun"); 1170 | } 1171 | return(); # no dacks for jrun 1172 | } elsif (defined $opts{status}) { 1173 | # job is done 1174 | if ( -e "$conf{spool}/jobs/$jid:$ip.run" ) { 1175 | my $job = unpack_file("$conf{spool}/jobs/$jid:$ip.run"); 1176 | if ($job) { 1177 | if (my $n=$nodes{$ip}) { 1178 | xlog("debug", "Clearing cpu: $job->{cpus}, memory: $job->{memory} from $ip"); 1179 | $n->{a_cpus} -= $job->{cpus}; # deallocate 1180 | $n->{a_memory} -= $job->{memory}; # deallocate 1181 | for (@metrics) { 1182 | $n->{"a_$_"} -= $job->{param}->{$_}; 1183 | } 1184 | # TODO: probably would be nice, in a jstat, to include metrics so all these aren't guessing 1185 | 1186 | $n->{avail} += $job->{cpus}; # return to avail, until %node is updated 1187 | $n->{mem} += $job->{memory}; # return mem 1188 | $n->{avail} = max($n->{avail},min($n->{cpus},$n->{orig_cpus}-$n->{load})); # better guess 1189 | } else { 1190 | xlog("error", "Got a report for $jid, from $ip, but there's no node like that"); 1191 | } 1192 | archive_job($jid, $job, $opts{status}, $ip, \%opts); 1193 | } else { 1194 | xlog("debug", "Bad job file $conf{spool}/jobs/$jid:$ip.run"); 1195 | } 1196 | } else { 1197 | my $jhistfile=jhistpath($jid); 1198 | if (-e $jhistfile) { 1199 | # repeated acks are ok 1200 | xlog("debug", "Got a duplicate report from $conf{master} for job $jid ($jhistfile)"); 1201 | } else { 1202 | xlog("error", "Got a report for an unknown job $jid, from $ip, status: $opts{status}"); 1203 | } 1204 | } 1205 | sendcmd_nowait($ip, $opts{replyport}?$opts{replyport}:$conf{port}, 'dack', jid=>$jid); 1206 | } else { 1207 | xlog("error", "Got a report for a job $jid with no status info ($src_data)"); 1208 | } 1209 | # return ack even if not exists 1210 | } 1211 | } elsif ($cmd eq 'dack') { 1212 | # ackknowledge receipt of status signal, so you don't have to do it again 1213 | my %dack = @args; 1214 | xlog("debug", "Got dack for $dack{jid}") if $trace; 1215 | if ($dack{jid}) { 1216 | if (!$dack{jrun}) { 1217 | if (!$io_wait{$dack{jid}} || !$io_wait{$dack{jid}}->{streams}) { 1218 | exec_clean($dack{jid}); 1219 | } else { 1220 | xlog("debug", "Still waiting on i/o: " . packdump($io_wait{$dack{jid}}) . 
", will clean later") if $trace; 1221 | # ready to clean up 1222 | burp("$conf{spool}/jstat/$dack{jid}.dack",1); 1223 | } 1224 | } 1225 | } 1226 | } elsif ($cmd eq 'jexok') { 1227 | my ($jex) = @args; 1228 | my $jid = $jex->{id}; 1229 | xlog("debug", "Writing $jid.ok") if $trace; 1230 | touch("$conf{spool}/jobs/$jid:$ip.run"); 1231 | burp("$conf{spool}/jobs/$jid.ok",packfile($jex)); 1232 | } elsif ($cmd eq 'xexec') { 1233 | my ($opts) = @args; 1234 | execute_job($opts); 1235 | } elsif ($cmd eq 'xclean') { 1236 | # cleanup iowait stuff.. i got everything 1237 | my ($jid) = @args; 1238 | xlog("debug", "Client is finished with $jid\n"); 1239 | delete $io_wait{$jid}; 1240 | delete $start_wait{$jid}; 1241 | if (-e "$conf{spool}/jstat/$jid.dack") { 1242 | exec_clean($jid); 1243 | } 1244 | } elsif ($cmd eq 'xabort') { 1245 | # kill job 1246 | my ($jid, $sig, $termio) = @args; 1247 | 1248 | if ($io_wait{$jid} && $io_wait{$jid}->{streams}) { 1249 | xlog("debug", "Alerting streams that $jid is dead\n");; 1250 | for(values %{$io_wait{$jid}->{streams}}) { 1251 | replymsg($_, "quit"); 1252 | } 1253 | delete $io_wait{$jid}->{streams}; 1254 | touch("$conf{spool}/jstat/$jid.dumped"); 1255 | } 1256 | 1257 | my $job = unpack_file("$conf{spool}/jpids/$jid"); 1258 | xlog("debug", "Found job $jid with pid $job->{pid}\n");; 1259 | if ((my $pid = $job->{pid}) > 1) { 1260 | $sig = 2 if $sig !~ /^\d+$/; # force sig to 2 if not a number 1261 | # kill the whole shebang 1262 | my $ok = kill(-$sig, $pid); 1263 | xlog("note", "Kill ($sig) job $jid (pgrp:$pid), return: $ok"); 1264 | 1265 | if ($ok) { 1266 | # report status as "killed" 1267 | selfcmd("sstat", {id=>$jid,status=>127+$sig,error=>"Job $jid aborted",dumped=>1}); 1268 | return "Job $jid aborted"; 1269 | } else { 1270 | return "Job $jid kill $sig failed : $!"; 1271 | } 1272 | if ($io_wait{$jid} && $io_wait{$jid}->{zid}) { 1273 | # tell waiters it's dead, and streams are dumped 1274 | replymsg($io_wait{$jid}->{zid},"$jid:stat",{status=>127+$sig, error=>"Job $jid aborted", dumped=>1}); 1275 | } 1276 | } else { 1277 | return "Job $jid not found for xabort"; 1278 | } 1279 | } elsif ($cmd eq 'xio') { 1280 | my ($jid) = @args; 1281 | my $ready = 0; 1282 | my $known = 0; 1283 | if ($io_wait{$jid} && $io_wait{$jid}->{streams}) { 1284 | # definitely not dumped 1285 | $ready = 1; 1286 | for(values %{$io_wait{$jid}->{streams}}) { 1287 | replymsg($_, "ready"); 1288 | } 1289 | delete $io_wait{$jid}->{streams}; 1290 | $known = 1; 1291 | } 1292 | 1293 | # set wait flag, if i/o wasn't dumped 1294 | my $stat=getjobstathash($jid); 1295 | 1296 | # streams are ready, or job isn't done, or io wasn't dumped 1297 | if ($ready||!$stat||!$stat->{dumped}) { 1298 | my $end_time; 1299 | # job is done 1300 | if ($stat && $stat->{rtime}) { 1301 | $end_time = (stat("$conf{spool}/jstat/$jid.stat"))[9]+$stat->{rtime}; 1302 | } 1303 | # finished a while ago 1304 | if ($stat && ($end_time < (time()-(5*$conf{ping_secs}))) ) { 1305 | xlog("error", "Dumping i/o for completed job $jid because streamer hasn't responded in $def{ping_secs} secs, end-time: $end_time"); 1306 | touch("$conf{spool}/jstat/$jid.dumped"); 1307 | $stat->{dumped}=1; 1308 | } else { 1309 | xlog("debug", "Creating io_wait hash entry for $jid, (E:$end_time, $stat)") unless $io_wait{$jid}->{zid} eq $zid; 1310 | $io_wait{$jid}->{type} = 'io'; 1311 | $io_wait{$jid}->{zid} = $zid; 1312 | $io_wait{$jid}->{time} = time(); # toss this entry if it gets oldi 1313 | $known = 1; 1314 | } 1315 | } 1316 | 1317 | if ($stat) { 1318 | if 
($stat->{dumped} && $ready ) { 1319 | xlog("error", "Dumped stream $jid before ready received"); 1320 | } 1321 | xlog("debug", "Returning $jid:stat for $jid"); 1322 | return("$jid:stat", $stat); 1323 | } 1324 | if (!$known) { 1325 | # unknown... no iowait, no stat 1326 | xlog("debug", "Returning $jid:unk for $jid, no io_wait set"); 1327 | return("$jid:unk"); 1328 | } else { 1329 | # iowait ready 1330 | return() 1331 | } 1332 | } elsif ($cmd eq 'xstat') { 1333 | my ($jid) = @args; 1334 | 1335 | if ($io_wait{$jid} && $io_wait{$jid}->{streams}) { 1336 | for(values %{$io_wait{$jid}->{streams}}) { 1337 | replymsg($_, "quit"); 1338 | } 1339 | delete $io_wait{$jid}->{streams}; 1340 | } 1341 | 1342 | my $stat=getjobstathash($jid); 1343 | return("$jid:stat", $stat) if $stat; 1344 | 1345 | # only set wait flag if not found 1346 | xlog("debug", "Creating io_wait hash 'xstat' entry for $jid"); 1347 | 1348 | $io_wait{$jid}->{type} = 'stat'; 1349 | $io_wait{$jid}->{zid} = $zid; 1350 | $io_wait{$jid}->{time} = time(); # toss this entry if it gets old 1351 | 1352 | return (); # wait for response 1353 | } elsif ($cmd eq 'jinfo') { # tell me what host a job is on 1354 | my ($jid) = @args; 1355 | if (!$jid) { 1356 | xlog("error", "Job '$jid' does not exist from $ip"); 1357 | return(error=>"Job '$jid' does not exist from $ip"); 1358 | } 1359 | if (! -e "$conf{spool}/jobs/$jid") { 1360 | if (! -e "$conf{spool}/jobs/$jid.ip") { 1361 | my $jhist=jhiststat($jid); 1362 | if (! $jhist) { 1363 | xlog("error", "Job '$jid' does not exist from $ip") if $jid; 1364 | return (error=>"Job '$jid' does not exist during jinfo."); 1365 | } else { 1366 | return(%$jhist); 1367 | } 1368 | } else { 1369 | my $job_ip = slurp("$conf{spool}/jobs/$jid.ip"); 1370 | if (!$job_ip) { 1371 | xlog("error", "No ip for job $jid"); 1372 | return (); 1373 | } else { 1374 | # route to correct node 1375 | if (!$nodes{$job_ip}) { 1376 | my $jhist=jhiststat($jid); 1377 | if (!$jhist) { 1378 | xlog("error", "Job $jid is linked to $job_ip which is not responding"); 1379 | return (warn=>"Job $jid is linked to $job_ip which is not responding"); 1380 | } else { 1381 | return(%$jhist); 1382 | } 1383 | } else { 1384 | return (jid=>$jid, ip=>$job_ip, port=>$nodes{$job_ip}->{port}, hostname=>$nodes{$job_ip}->{hostname}); 1385 | } 1386 | } 1387 | } 1388 | } else { 1389 | # wait for job start 1390 | $start_wait{$jid}->{zid} = $zid; # set router id 1391 | $start_wait{$jid}->{time} = time(); # refresh timestamp 1392 | return (); # wait for response 1393 | } 1394 | } elsif ($cmd eq 'run') { 1395 | # user command, returns jid=>jobid [, error=>string] 1396 | if (!$conf{services}->{queue}) { 1397 | return (error=>"No queue service running on this host"); 1398 | } else { 1399 | my $time = time(); 1400 | ++$gjobid; 1401 | burp("$conf{spool}/nextid", $gjobid); 1402 | 1403 | # the job file 1404 | my $jid = $gjobid; 1405 | my $file = "$conf{spool}/jobs/$jid"; 1406 | my $job = $args[0]; 1407 | my $gfile = "$conf{spool}/guids/$job->{guid}"; 1408 | 1409 | # stick the ip the job came from in the options 1410 | $job->{time}=time(); 1411 | $job->{fromip}=$ip; 1412 | $job->{trace}=$trace; 1413 | 1414 | if ( -e $gfile) { 1415 | return (jid=>slurp($gfile), already=>1); 1416 | } 1417 | xlog("debug", "Created $file\n") if $trace; 1418 | open(G, ">$gfile") || return('error'=>"Can't create $gfile : $!"); 1419 | open(F, ">$file") || return('error'=>"Can't create $file : $!"); 1420 | print F packfile($job); 1421 | close F; 1422 | 1423 | print G $jid; 1424 | close G; 1425 | return 
(jid=>$jid); 1426 | } 1427 | } elsif ($cmd eq 'sstat') { 1428 | my ($stat) = @args; 1429 | notifystat($stat); 1430 | } elsif ($cmd eq 'sready') { 1431 | my ($key) = @args; 1432 | my ($jid) = $key =~ /^(\d+)/; 1433 | if ($io_wait{$jid} && $io_wait{$jid}->{zid}) { 1434 | if (!($io_wait{$jid}->{type} eq 'io')) { 1435 | return('quit'); 1436 | } else { 1437 | return('ready'); 1438 | } 1439 | } 1440 | if (! -e "$conf{spool}/jpids/$jid" ) { 1441 | xlog("debug", "Dumping stream, $jid is dead\n"); 1442 | delete $io_wait{$jid}; 1443 | return('quit'); 1444 | } 1445 | if ( -s "$conf{spool}/jstat/$jid.stat" ) { 1446 | if ( $conf{loop_num} > 4 ) { 1447 | # any time you check mod times or abandon things, remember to ensure your loop num is more than some small number 1448 | if (fmodtime("$conf{spool}/jstat/$jid.stat"){streams}->{$key}=$zid; 1457 | $io_wait{$jid}->{time}=time(); 1458 | return (); 1459 | } elsif ($cmd eq 'stream') { 1460 | my ($key, $data) = @args; 1461 | my ($jid) = ($key =~ /^(\d+)/); 1462 | if ($io_wait{$jid} && $io_wait{$jid}->{type} eq 'io' && $io_wait{$jid}->{zid}) { 1463 | replymsg($io_wait{$jid}->{zid},$key, $data); 1464 | } else { 1465 | xlog("debug", "Dumping stream $key, no wait $jid\n") if $trace; 1466 | if ($data !~ /:end/) { 1467 | burp("$conf{spool}/jstat/$jid.dumped",1) if length($data)>0; 1468 | } else { 1469 | touch("$conf{spool}/jstat/$jid.dumped"); 1470 | } 1471 | } 1472 | return (); 1473 | } else { 1474 | return ('error'=>"Unknown command $cmd ($src_data)"); 1475 | } 1476 | 1477 | return (); 1478 | } 1479 | 1480 | sub jhistpath { 1481 | my ($id) = @_; 1482 | my $left = int($id/10000); 1483 | my $right = $id; 1484 | my $dir = "$conf{spool}/jhist/$left"; 1485 | mkdir($dir) if ! -d $dir; 1486 | return "$dir/$right"; 1487 | } 1488 | 1489 | sub child_exit { 1490 | my ($kid, $status) = @_; 1491 | if ($pid_jobs{$kid}) { 1492 | my $jid=$pid_jobs{$kid}->{jid}; 1493 | if ($jid) { 1494 | if (-s "$conf{spool}/jstat/$jid.stat") { 1495 | notifystat(unpack_file("$conf{spool}/jstat/$jid.stat"), 1); 1496 | touch("$conf{spool}/jpids/$jid") if -e "$conf{spool}/jpids/$jid"; 1497 | } 1498 | } 1499 | delete $pid_jobs{$kid}; 1500 | } 1501 | } 1502 | 1503 | sub schedule { 1504 | my $did=0; 1505 | 1506 | return unless %nodes; 1507 | 1508 | while ((my $kid = waitpid(-1, WNOHANG))>1) { 1509 | $did=1; 1510 | if ($conf{services}->{exec}) { 1511 | child_exit($kid, $?); 1512 | } 1513 | } 1514 | 1515 | my $tcpu; 1516 | for my $n (values %nodes) { 1517 | if (defined $n->{a_cpus}) { 1518 | $n->{a_cpus} = $n->{a_memory} = 0; 1519 | } 1520 | $tcpu+=$n->{cpus}; 1521 | } 1522 | 1523 | # pass 1 : deduct resources for running jobs 1524 | opendir(D,"$conf{spool}/jobs"); 1525 | my @D = sort {(.5-rand()) cmp (.5-rand())} readdir(D); 1526 | closedir(D); 1527 | 1528 | if (($conf{loop_num}%100)==5) { 1529 | # ocassionally check for expiration of start_waits, though it should never happen 1530 | for my $jid (keys(%start_wait)) { 1531 | if ($conf{expire_secs} && (time() > $start_wait{$jid}->{time}+$conf{expire_secs})) { 1532 | my $jhistfile=jhistpath($jid); 1533 | if ( -e $jhistfile ) { 1534 | $did=1; 1535 | my $job=unpack_file($jhistfile); 1536 | # possibly killed during messaging? 
in general this shouldn't happen anymore, so it's an error 1537 | xlog("error", "Job $jid, reply to jinfo after archive"); 1538 | replymsg($start_wait{$jid}->{zid},jid=>$jid,status=>$job->{status}, error=>$job->{error}, hostname=>$job->{host}, ip=>"n/a"); 1539 | } 1540 | } 1541 | } 1542 | } 1543 | 1544 | my $jcnt=0; 1545 | my $jcpu=0; 1546 | my $jnee=0; 1547 | for my $jrun (@D) { 1548 | ++$jnee unless ($jrun =~ /\.ip$/); 1549 | next unless ($jrun =~ /\.run$/); 1550 | --$jnee; 1551 | 1552 | my $job=read_job($jrun); 1553 | my $job_ip=$1 if $jrun=~/\:(\S+)\.run$/; 1554 | 1555 | # this should be config, min requirements 1556 | $job->{cpus} = 1 if $job->{cpus} == 0; 1557 | $job->{memory} = ($conf{default_memory}*1000) if $job->{memory} == 0; 1558 | for ((@metrics, @labels)) { 1559 | $job->{$_} = $conf{"default-$_"} if !$job->{$_} && $conf{"default-$_"}; 1560 | } 1561 | 1562 | # job is running, make sure it's resources are somewhat locked 1563 | my ($jid) = $jrun =~ m/^(\d+)/; 1564 | 1565 | # check to see whether job was ever started 1566 | if ($conf{loop_num}>5 && (fmodtime($job->{file}) < (time()-$conf{ping_secs})) && ! -e "$conf{spool}/jobs/$jid.ok") { 1567 | rename("$conf{spool}/jobs/$jrun","$conf{spool}/jobs/$jid"); 1568 | delete $nodes{$job_ip}; 1569 | xlog("error", "No execution confirm. Deregister $job_ip as bad, put job $jid back in the queue"); 1570 | } 1571 | # check to see whether job has expired 1572 | if ($conf{loop_num}>5 && $conf{expire_secs} && (fmodtime($job->{file}) < (time()-$conf{expire_secs}))) { 1573 | $did=1; 1574 | selfcmd('jstat', {pid=>-1, id=>$jid, status => $STATUS_ORPHAN, error=>"Orphaned job (" . (time()-fmodtime($job->{file})) . "s)"}); 1575 | } else { 1576 | $job_ip = slurp("$conf{spool}/jobs/$jid.ip") unless $job_ip; 1577 | if ($job_ip =~ /\d/ && $nodes{$job_ip} && $nodes{$job_ip}->{ip}) { 1578 | ++$jcnt; 1579 | $jcpu+=$job->{cpus}; 1580 | $nodes{$job_ip}->{a_cpus} += $job->{cpus}; 1581 | $nodes{$job_ip}->{a_memory} += $job->{memory}; 1582 | for (@metrics) { 1583 | $nodes{$job_ip}->{"a_$_"} += $job->{param}->{$_}; 1584 | } 1585 | } 1586 | } 1587 | } 1588 | 1589 | 1590 | my @nlist = values %nodes; 1591 | # no attempt here to prioritize jobs, just match and go 1592 | my $cnt=0; 1593 | my $full = ($jcpu/$tcpu); 1594 | 1595 | for my $jid (@D) { 1596 | next unless ($jid =~ /^\d+$/o); 1597 | 1598 | my $job = read_job($jid); 1599 | 1600 | next if $job->{state} eq 'hold'; 1601 | 1602 | $job->{priority} = $conf{default_priority} if !$job->{priority}; 1603 | next if ($job->{priority} < 20) && ($job->{priority}/5 < (sqrt(rand())*$full)); 1604 | 1605 | $full+=($job->{priority}*$job->{cpus})/($tcpu*5); 1606 | 1607 | # this should be config, min requirements 1608 | $job->{cpus} = 1 if $job->{cpus} == 0; 1609 | $job->{memory} = ($conf{default_memory}*1000) if $job->{memory} == 0; 1610 | 1611 | my @dereg; 1612 | my @n; 1613 | my ($max_av, $max_n); 1614 | 1615 | if ($cnt >= $conf{max_sched}) { 1616 | last; 1617 | } 1618 | if (!@nlist) { 1619 | xlog("debug", "No available nodes, not scheduling"); 1620 | last; 1621 | } 1622 | 1623 | ++$cnt; 1624 | my $spread = rand() > $conf{spread_pct}/100; 1625 | 1626 | for (my $i = 0; $i < @nlist; ++$i) { 1627 | my $n = $nlist[$i]; 1628 | 1629 | # jobs need enough memory, cpu availability and disk space... that's it 1630 | if (!$n->{ip}) { 1631 | xlog("error", "Node has no ip! " . 
packdump($n)); 1632 | next; 1633 | } 1634 | my $cpus = $n->{cpus} - $n->{a_cpus}; 1635 | $cpus = $n->{avail} if $n->{avail} < $cpus; 1636 | 1637 | my $mem = $n->{tmem} - $n->{a_memory}; 1638 | $mem = $n->{mem} if ($n->{tmem} == 0) || ($n->{mem} < $mem); 1639 | 1640 | # See below, only log reason 1641 | # if (debugging()) { 1642 | # xlog("debug", "Sched $jid: $n->{ip}: jcpu:$job->{cpus}, norig: $n->{orig_cpus}, ncpu:$n->{cpus}, nall:$n->{a_cpus}, nav:$n->{avail}, cpus:$cpus , jmem:$job->{memory}, nmem:$n->{mem}, avmem:$mem"); 1643 | # } 1644 | 1645 | if ($cpus <= 0) { 1646 | # pull the node out of the list 1647 | xlog("debug", "Removing $n->{hostname} from node list, cpus are $cpus"); 1648 | splice(@nlist, $i, 1); 1649 | --$i; 1650 | } 1651 | # did it ping recently? 1652 | if ($n->{ping} > (time()-$conf{ping_secs}*6)) { 1653 | my $ok = 1; 1654 | for (@metrics) { 1655 | if (($n->{"param-$_"} - $n->{"a_$_"}) < $job->{param}->{$_}) { 1656 | $ok =0; 1657 | } 1658 | } 1659 | for (@labels) { 1660 | if (!($n->{"param-$_"} eq $job->{param}->{$_})) { 1661 | $ok =0; 1662 | } 1663 | } 1664 | 1665 | if ( ($mem >= $job->{memory}) && 1666 | (($cpus+$conf{idle_load}) >= $job->{cpus}) && 1667 | ($n->{disk} >= $job->{disk}) && 1668 | ($ok) 1669 | ) { 1670 | next if $job->{hosts} && $job->{hosts} !~ /$n->{hostname}/; 1671 | 1672 | my $match = 1; 1673 | if ($conf{match}) { 1674 | $match = evalctx($conf{match}, node=>$n, job=>$job); # eval perl expression 1675 | if ($@) { 1676 | $match = 1; # permit all on error? 1677 | } 1678 | } 1679 | next unless $match; 1680 | 1681 | if ($spread) { 1682 | # use *least* busy node 1683 | if ($n->{load} < $conf{idle_load}) { 1684 | # don't bother checking further, this node is bored 1685 | $max_n = $n; 1686 | last; 1687 | } else { 1688 | if ($n->{avail} > $max_av) { 1689 | $max_n = $n; 1690 | $max_av = $n->{avail}; 1691 | } 1692 | } 1693 | } else { 1694 | # use *first* node 1695 | $max_n = $n; 1696 | $max_av = $n->{avail}; 1697 | last; 1698 | } 1699 | } else { 1700 | my $reason = ""; 1701 | if (!($mem >= $job->{memory})) { 1702 | $reason = "memory ($mem)"; 1703 | } elsif (!(($cpus+$conf{idle_load}) >= $job->{cpus})) { 1704 | $reason = "cpus ($cpus)"; 1705 | } elsif (!(($n->{disk} >= $job->{disk}))) { 1706 | $reason = "disk ($n->{disk})"; 1707 | } else { 1708 | for (@metrics) { 1709 | if (($n->{"param-$_"} - $n->{"a_$_"}) < $job->{param}->{$_}) { 1710 | $reason = "$_ (" . $n->{"a_$_"} . ")"; 1711 | } 1712 | } 1713 | } 1714 | xlog("debug", "Sched $jid: $n->{ip}: $reason"); 1715 | } 1716 | } else { 1717 | push @dereg, $n->{ip} if $n->{ip}; 1718 | } 1719 | } 1720 | for my $ip (@dereg) { 1721 | xlog("note", "Deregister node '$ip', last ping was " . (time()-$nodes{$ip}->{ping}) . " seconds ago"); 1722 | delete $nodes{$ip}; 1723 | $did=1; 1724 | } 1725 | if ($max_n) { 1726 | $did=1; 1727 | xlog("debug", "Matched '$max_n->{ip}' to job $job->{file}") if $job->{trace}; 1728 | 1729 | # todo... change this... it's an ugly way of owning jobs 1730 | my $jmine = "$job->{file}:" . $max_n->{ip} . ".run"; 1731 | touch($job->{file}) if -e $job->{file}; 1732 | rename($job->{file}, $jmine); 1733 | if ( -e $jmine ) { 1734 | my $jptr = "$job->{file}" . 
".ip"; 1735 | burp($jptr, $max_n->{ip}); 1736 | noderun($max_n, $jid, $job); 1737 | 1738 | # TODO: handle metrics universally, allocated, total, and current 1739 | 1740 | $max_n->{a_cpus} += $job->{cpus}; # allocate 1741 | $max_n->{a_memory} += $job->{memory}; 1742 | for (@metrics) { 1743 | $max_n->{"a_$_"} += $job->{param}->{$_}; 1744 | } 1745 | $max_n->{avail} -= $job->{cpus}; # assume being used 1746 | $max_n->{mem} -= $job->{memory}; # assume being used 1747 | ++$jcnt; 1748 | } else { 1749 | xlog("error", "Rename failed for $jmine\n"); 1750 | } 1751 | } else { 1752 | if ($conf{backup_grid}) { 1753 | # TODO: fork exec to backup grid, and add to list of pids to track... as if you're an exec node 1754 | } 1755 | xlog("debug", "Can't find node for $jid, $jcnt jobs running did=$did\n") if $conf{trace}; 1756 | } 1757 | } 1758 | 1759 | # kickstart nodes, if needed 1760 | $did|=kicknodes(); 1761 | 1762 | return $did; 1763 | } 1764 | 1765 | sub read_job { 1766 | my ($jid)=@_; 1767 | my $jfil = "$conf{spool}/jobs/$jid"; 1768 | next unless -f $jfil; 1769 | my $ref = unpack_file($jfil); 1770 | if (!(ref($ref) eq 'HASH')) { 1771 | xlog("error", "Invalid job file format ($ref): $jfil -> $conf{spool}/trash/$jid\n"); 1772 | rename($jfil, "$conf{spool}/trash/$jid"); 1773 | } 1774 | $ref->{file}=$jfil; 1775 | return $ref; 1776 | } 1777 | 1778 | sub noderun { 1779 | my ($n, $jid, $job) = @_; 1780 | # send 'exec' 1781 | $job->{port} = $conf{port}; # reply to me on this port 1782 | $job->{id} = $jid; 1783 | if ($start_wait{$jid}) { 1784 | # info needed for status/stdio collection from execution node 1785 | replymsg($start_wait{$jid}->{zid},jid=>$jid, ip=>$n->{ip}, port=>$n->{port}, hostname=>$n->{hostname}); 1786 | delete $start_wait{$jid}; 1787 | } 1788 | sendcmd($n->{ip},$n->{port},'xexec', $job); 1789 | } 1790 | 1791 | # called at start, and kill -HUP 1792 | sub init { 1793 | $ENV{HOSTNAME} = `hostname`; 1794 | chomp $ENV{HOSTNAME}; 1795 | readconf(); 1796 | $conf{version}=$VERSION; 1797 | if ($daemon) { 1798 | mkdir $conf{spool}; 1799 | mkdir "$conf{spool}/jobs"; 1800 | mkdir "$conf{spool}/jstat"; 1801 | mkdir "$conf{spool}/jhist"; 1802 | mkdir "$conf{spool}/nodes"; 1803 | mkdir "$conf{spool}/pids"; 1804 | mkdir "$conf{spool}/jpids"; 1805 | mkdir "$conf{spool}/trash"; 1806 | mkdir "$conf{spool}/guids"; 1807 | # reregister on reread 1808 | delete $conf{node}; 1809 | } 1810 | 1811 | $conf{hostip} = host2ip($conf{hostname}) unless $conf{hostip}; 1812 | 1813 | if (!$conf{hostip}) { 1814 | die "Can't start server without knowing ip. 
$conf{hostname} does not resolve to ip, and no hostip defined\n";
1815 | }
1816 | }
1817 |
1818 | sub getmem {
1819 | my ($cache, $free, $tot);
1820 | open F, "/proc/meminfo";
1821 | while (<F>) {
1822 | $tot = $1 if /MemTotal:\s*(\d+)/i;
1823 | $free = $1 if /MemFree:\s*(\d+)/i;
1824 | $cache = $1 if /Cached:\s*(\d+)/i;
1825 | last if $cache && $free;
1826 | }
1827 | close F;
1828 | return ($tot, $cache + $free);
1829 | }
1830 |
1831 | sub getcpus {
1832 | my $cores;
1833 | open F, "/proc/cpuinfo";
1834 | my %cores;
1835 | while (<F>) {
1836 | $cores{$1}=1 if /processor\s*:\s*(\d+)/i;
1837 | }
1838 | close F;
1839 | $cores = scalar keys %cores if %cores;
1840 | return $cores;
1841 | }
1842 |
1843 | sub getbench {
1844 | my ($force)=@_;
1845 | my $bench = slurp("$conf{spool}/bench");
1846 | if (!$bench||$force||(fmodtime("$conf{spool}/bench")<(time()-$conf{bench_secs}))) {
1847 | my $s = Time::HiRes::time();
1848 | my $i=0;
1849 | while (Time::HiRes::time() < $s+3) {
1850 | my %d = (1..10000);
1851 | map {$d{$_}*=3.333} keys(%d);
1852 | map {$d{$_}/=2.222} keys(%d);
1853 | ++$i;
1854 | }
1855 | my $e = Time::HiRes::time();
1856 | $bench=$i/($e-$s);
1857 | burp("$conf{spool}/bench",$bench);
1858 | }
1859 | return $bench;
1860 | }
1861 |
1862 | sub slurp
1863 | {
1864 | my $dat;
1865 | my $in = new IO::File;
1866 | return undef unless open($in, $_[0]);
1867 | local $/ = undef;
1868 | $dat = $in->getline;
1869 | $in->close;
1870 | close($in);
1871 | return $dat;
1872 | }
1873 |
1874 | sub srvexec {
1875 | my $did=0;
1876 |
1877 | # ensure we can't flood on misconfig
1878 | $conf{ping_secs} = 5 if $conf{ping_secs} == 0;
1879 |
1880 | if ($conf{services}->{exec} && (!$conf{node} || (time() > ($conf{node}->{ping}+$conf{ping_secs}-1)))) {
1881 | # ping master with stats
1882 | $conf{node}->{arch} = `arch`; chomp $conf{node}->{arch};
1883 | ($conf{node}->{tmem},$conf{node}->{mem}) = getmem(); # free mem
1884 | $conf{node}->{load} = slurp("/proc/loadavg"); # load
1885 | $conf{node}->{orig_cpus} = getcpus();
1886 | $conf{node}->{cpus} = $conf{cpus} ? $conf{cpus} : $conf{node}->{orig_cpus}; # num cores
1887 | $conf{node}->{bench} = $conf{bench} ?
$conf{bench} : getbench(); # num cores 1888 | $conf{node}->{avail} = min($conf{node}->{cpus}, $conf{node}->{orig_cpus} - $conf{node}->{load}); 1889 | $conf{node}->{ping} = time(); 1890 | $conf{node}->{port} = $conf{port}; 1891 | $conf{node}->{ip} = $conf{hostip}; 1892 | $conf{node}->{hostname} = $conf{hostname}; 1893 | $conf{node}->{kernel} = `uname -rv`; chomp $conf{node}->{kernel}; 1894 | $conf{node}->{arch} = `uname -m`; chomp $conf{node}->{arch}; 1895 | 1896 | for ((@labels,@metrics)) { 1897 | $conf{node}->{"param-$_"} = $conf{"param-$_"}; 1898 | } 1899 | 1900 | $did=1; 1901 | $conf{registered} = 1; 1902 | if (!sendcmd($conf{master}, $conf{master_port}, 'node', $conf{node})) { 1903 | $conf{registered} = 0; 1904 | } 1905 | } 1906 | 1907 | while ((my $kid = waitpid(-1, WNOHANG))>1) { 1908 | $did=1; 1909 | child_exit($kid, $?); 1910 | } 1911 | 1912 | if (time() > ($conf{lastpidtime}+$conf{ping_secs})) { 1913 | # check for expiration of io_wait 1914 | if ($conf{loop_num}>5) { 1915 | for (keys(%io_wait)) { 1916 | if ($conf{expire_secs} && (time() > $io_wait{$_}->{time}+$conf{expire_secs})) { 1917 | delete $io_wait{$_}; 1918 | } 1919 | } 1920 | } 1921 | 1922 | $did=1; 1923 | opendir(D,"$conf{spool}/jpids") or die "Can't open jpids\n"; 1924 | 1925 | my $mjob; 1926 | my $oksusp; 1927 | while(my $jid = readdir(D)) { 1928 | next unless $jid =~ /^\d/; 1929 | next unless fmodtime("$conf{spool}/jpids/$jid") < time()-$conf{ping_secs}; 1930 | if (-s "$conf{spool}/jstat/$jid.stat") { 1931 | notifystat(unpack_file("$conf{spool}/jstat/$jid.stat"), 1); 1932 | next; 1933 | } 1934 | # been a while... check to see if it's alive 1935 | my $job = unpack_file("$conf{spool}/jpids/$jid"); 1936 | my $pid = $job->{pid}; 1937 | 1938 | # there could be a fake pid for a jobs that "ran" but for whatever reason, never started 1939 | 1940 | next unless $pid =~ /^\d+$/; 1941 | 1942 | if ($conf{node}->{avail} < -($conf{node}->{cpus}/2)) { 1943 | if (!$mjob && ($job->{priority} < $mjob->{priority})) { 1944 | $mjob = $job; 1945 | } 1946 | ++$oksusp; 1947 | } 1948 | 1949 | if ($conf{node}->{avail} > 0) { 1950 | if ( -e "$conf{spool}/jstat/$job->{id}.held" ) { 1951 | kill(-18, $pid); 1952 | xlog("note", "Resuming $job->{id}, because node is available"); 1953 | sendcmd($conf{master},$conf{master_port}, 'jedit', $mjob->{id}, state=>''); 1954 | unlink("$conf{spool}/jstat/$job->{id}.held"); 1955 | } 1956 | } 1957 | 1958 | # wait for pids 1959 | my $alive = kill(0, $pid); 1960 | if ($alive) { 1961 | $io_wait{$jid}->{time}=time() if ($io_wait{$jid}); 1962 | notifystat({id=>$jid, jrun=>1}); 1963 | touch("$conf{spool}/jpids/$jid") if -e "$conf{spool}/jpids/$jid"; 1964 | } else { 1965 | notifystat({id=>$jid, status=>$STATUS_ORPHAN, error=>"Unknown $jid exit code, really bad!", dumped=>1}); 1966 | } 1967 | } 1968 | closedir D; 1969 | 1970 | if ($mjob && $oksusp > 1) { 1971 | kill(-19, $mjob->{pid}); 1972 | touch("$conf{spool}/jstat/$mjob->{id}.held"); 1973 | xlog("note", "Suspending $mjob->{id}, because node is busy"); 1974 | sendcmd($conf{master},$conf{master_port}, 'jedit', $mjob->{id}, state=>'susp'); 1975 | } 1976 | 1977 | $conf{lastpidtime} = time(); 1978 | } 1979 | } 1980 | 1981 | sub touch { 1982 | my $nowisthe=time(); 1983 | return utime($nowisthe, $nowisthe, @_); 1984 | } 1985 | 1986 | sub fmodtime { 1987 | return (stat($_[0]))[9]; 1988 | } 1989 | 1990 | sub notifystat { 1991 | my ($stat, $from_jstat, $nowait) = @_; 1992 | 1993 | confess("stat is required\n") if (!$stat); 1994 | 1995 | if (!$stat->{jrun}) { 1996 | if 
($io_wait{$stat->{id}} && $io_wait{$stat->{id}}->{zid}) { 1997 | # tell the client that the job is done 1998 | replymsg($io_wait{$stat->{id}}->{zid},"$stat->{id}:stat",$stat); 1999 | } else { 2000 | # it's not a request to notify that came from the file 2001 | # and yet, the file is there, with, presumably, valid status info 2002 | # so which do we report? 2003 | 2004 | # I assume, report what's in the file....ie: it was there first 2005 | # this can (rarely) happen if you kill a job after it completes successfully, for example 2006 | 2007 | # TODO: the safer thing is to report the "worst case" ... ie: if either is an error, report error 2008 | # and if both agree ... then don't log anything here... not a problem 2009 | 2010 | if ( ! $from_jstat && -s "$conf{spool}/jstat/$stat->{id}.stat") { 2011 | # log the problem for inspection 2012 | xlog("error", "Got alternative status for $stat->{id}, this may not be correct!"); 2013 | # save the new status as an error file to inspect later 2014 | burp("$conf{spool}/jstat/$stat->{id}.stat-err", packfile($stat)); 2015 | } 2016 | } 2017 | } 2018 | 2019 | # already dack'ed 2020 | if ( -e "$conf{spool}/jstat/$stat->{id}.dack" ) { 2021 | xlog("debug", "Not notifying status for $stat->{id}, dack already set"); 2022 | exec_clean($stat->{id}); 2023 | return; 2024 | } 2025 | 2026 | xlog("debug", "Notifying status " . packdump($stat) . ".\n") if ($conf{trace} || $stat->{trace}); 2027 | 2028 | # tell main queue about the status change 2029 | $stat->{replyport}=$conf{port}; 2030 | if ($nowait) { 2031 | sendcmd_nowait($conf{master},$conf{master_port}, 'jstat', $stat); 2032 | } else { 2033 | sendcmd($conf{master},$conf{master_port}, 'jstat', $stat); 2034 | } 2035 | } 2036 | 2037 | # unpack a very simple configuration-style file 2038 | sub funpack { 2039 | my ($fil, $dat) = @_; 2040 | return gunpack(slurp($fil), $dat); 2041 | } 2042 | 2043 | sub gunpack { 2044 | my ($msg, $dat) = @_; 2045 | $dat = {} if !$dat; 2046 | for (split(/\n/, $msg)) { 2047 | my ($k, $v) = m/^\s*([^:=]+)?\s*[:=]\s*(.*?)\s*$/; 2048 | $k = lc($k); 2049 | $dat->{$k}=$v; 2050 | } 2051 | return $dat; 2052 | } 2053 | 2054 | # more complex config file support 2055 | # contains logic for turning delimited lists into array configs, etc. 2056 | sub readconf { 2057 | %conf = %def; 2058 | 2059 | _readconf("$conf{config}"); 2060 | 2061 | # defines happen at the end so defaults can get unpacked 2062 | for (keys %conf) { 2063 | next if ref $conf{$_}; 2064 | if ($_ eq 'match' || $_ =~ /^label/) { 2065 | # match rules are evaluated during matching, but reval now just to test 2066 | my $test = $conf{$_}; 2067 | # see http://www.perlmonks.org/?node_id=685699 for why this is OK 2068 | $test =~ s/`[^`]+`/1/g; # turn off backtics 2069 | $test =~ s/system\([^\)]\)/1/g; # turn off system calls 2070 | $safe->reval($test); # check syntax 2071 | if ($@) { 2072 | # report a problem with the rule 2073 | xlog("error", "Error testing match rule : $@"); 2074 | } 2075 | $@=''; 2076 | } elsif ( ! ($conf{$_} =~ s/^\{(.*)\}$/eval($1)/gei) ) { 2077 | # evaluate simple inline vars at configure-time 2078 | if ( $conf{$_} =~ m/\$([\w-]+)\{/) { 2079 | xlog("error", "Error, rule has a hash variable, which requires braces\n"); 2080 | } else { 2081 | $conf{$_} =~ s/\$([\w-]+)/$conf{lc($1)}?$conf{lc($1)}:$1/gei; 2082 | } 2083 | } 2084 | if ($_=~ /^param-(.*)/) { 2085 | my $nm=$1; 2086 | # evaluates to the value for that param... 
if numeric is a "metric" otherwise is a "label" 2087 | if ($conf{$_} =~ /^[\d.]+$/) { 2088 | push @metrics, $nm; 2089 | } else { 2090 | push @labels, $nm; 2091 | } 2092 | } 2093 | } 2094 | 2095 | # reorganize some conf vars into a hash 2096 | for my $k (qw(services log_types)) { 2097 | my $v; 2098 | $v = $conf{$k}; 2099 | $conf{$k} = {}; 2100 | for (split(/[\s,]+/,$v)) { 2101 | $conf{$k}->{$_} = 1; 2102 | } 2103 | } 2104 | 2105 | # these low-level entries are controlled by trace bits in packets... not by user preference 2106 | $conf{log_types}->{trace}=1; 2107 | 2108 | # stored as an array reference 2109 | $conf{env} = [split(/[\s,]+/,$conf{env})] unless ref($conf{env}) eq 'ARRAY'; 2110 | 2111 | # split up host/port 2112 | $conf{port} = $1 if $conf{bind} =~ s/:(\d+)$//; 2113 | # same for master (if different - has to be if there's a queue/exec on the same box) 2114 | $conf{master_port} = $1 if $conf{master} =~ s/:(\d+)$//; 2115 | $conf{master_port} = $conf{port} if !$conf{master_port}; 2116 | } 2117 | 2118 | # basic config reader, like funpack, but uses the %conf and %def hashes 2119 | sub _readconf { 2120 | my ($f) = @_; 2121 | %conf = %def; 2122 | if (!open(CONF, $f)) { 2123 | xlog("error", "Can't open '$f'"); 2124 | die("Can't open config '$f'\n"); 2125 | } 2126 | while() { 2127 | next if /^\s*#/; 2128 | my ($k, $v) = m/^\s*([^:]+)?\s*:\s*(.*?)\s*$/; 2129 | $k = lc($k); 2130 | if ($k eq 'include') { 2131 | _readconf($v); 2132 | } else { 2133 | $conf{$k} = $v; 2134 | } 2135 | } 2136 | close CONF; 2137 | } 2138 | 2139 | # log stuff. TODO: cache opened handles and test for operation... that way you won't have to reopen so many! 2140 | sub xlog { 2141 | my $m = join("\t", @_); 2142 | my $class = $_[0]; 2143 | return unless ref($conf{log_types}) && $conf{log_types}->{$class}; 2144 | $m =~ s/\n/ /g; 2145 | my $line = scalar(localtime) . "\t" . $m . "\n"; 2146 | my $log = $conf{"log_file"}; 2147 | if ($log && ! ($log eq '-')) { 2148 | open LOG, ">>" . $log; 2149 | print LOG $line; 2150 | close LOG; 2151 | print STDERR $line if $log_to_stderr; 2152 | } else { 2153 | print $line; 2154 | } 2155 | return $line; 2156 | } 2157 | 2158 | # wait for STDERR and STDOUT from a command 2159 | sub waitio { 2160 | my ($host, $port, $jobid, $opt) = @_; 2161 | my @resp; 2162 | my $stat; 2163 | my $err; 2164 | my $diderr; 2165 | 2166 | my $sock = _sendcmd($host, $port, 'xio', $jobid); 2167 | 2168 | my $stat_time; 2169 | $|=1; 2170 | my $needio = !$opt->{err}+!$opt->{out}; 2171 | my $start_wait=time(); 2172 | my $unk; 2173 | while ((!defined $stat) || $needio) { 2174 | my $got = 0; 2175 | # wait up to 5 seconds for output 2176 | zmq_poll([{ 2177 | socket=>$sock, events=>ZMQ_POLLIN, callback=> sub { 2178 | my ($ip, $trace, $key, $dat) = recvmsg($sock); 2179 | if ($ip) { 2180 | $got = 1; 2181 | my ($jid, $type, $cmd) = split /:/, $key; 2182 | if ($type eq 'err') { 2183 | $diderr = 1 if $dat; 2184 | print STDERR $dat if $dat; 2185 | --$needio if $cmd eq 'end'; 2186 | } elsif($type eq 'out') { 2187 | print STDOUT $dat if $dat; 2188 | --$needio if $cmd eq 'end'; 2189 | } elsif($type eq 'stat') { 2190 | $stat = $dat->{status}; 2191 | $err = $dat->{error}; 2192 | $stat_time=time() if !$stat_time; 2193 | 2194 | if (time()>($stat_time+$conf{expire_secs})) { 2195 | # i'm taking charge of dumping it 2196 | xlog("error", "Job $jobid, dumping i/o, and reporting failure... 
took too long"); 2197 | $dat->{dumped}=1; 2198 | } 2199 | 2200 | if ($dat->{dumped}) { 2201 | # don't wait longer for i/o if the i/o was lost 2202 | # todo... fail here if stat is 0? 2203 | if (!$stat) { 2204 | $stat=37; 2205 | $err="Failing because i/o was dumped"; 2206 | } 2207 | $needio = 0; 2208 | } 2209 | # what does the client do with the times? 2210 | } elsif($type eq 'unk') { 2211 | ++$unk; 2212 | sleep(1); 2213 | } else { 2214 | xlog("error", "Job $jobid, got message ($jid, $type, $cmd) in response to xio"); 2215 | } 2216 | } else { 2217 | xlog("error", "Job $jobid, got message ($key) in response to xio"); 2218 | } 2219 | }}],$conf{retry_secs}*2*1000); 2220 | 2221 | if (!$got) { 2222 | # if you didn't get anything, ask again 2223 | $sock = _sendcmd($host, $port, 'xio', $jobid); 2224 | if (time()>$start_wait+$conf{retry_secs}*7) { 2225 | # been a while...ask head node if this job is dead? 2226 | my %info = waitmsg($conf{master}, $conf{master_port}, 'jinfo', $jobid); 2227 | if ($info{status} =~ /\d/) { 2228 | $stat=$info{status}; 2229 | $err=$info{error}; 2230 | $needio = 0; 2231 | } 2232 | if ($unk > 200) { 2233 | $stat=$STATUS_UNKNOWN; 2234 | $err="Error, job submission failed"; 2235 | $needio = 0; 2236 | } 2237 | # restart wait time 2238 | $start_wait=time(); 2239 | } 2240 | } 2241 | } 2242 | return ($stat, $err, $diderr); 2243 | } 2244 | 2245 | sub waitmsg { 2246 | my $sock = _sendcmd(@_); 2247 | my ($ip, $trace, @msg) = recvmsg($sock); 2248 | return @msg; 2249 | } 2250 | 2251 | # this tries over and over to get a response.... 2252 | # usually this is not needed, but if the outer disappears, the zqm-id will be lost, so 2253 | # this is just for recovery in the event of the router shutting down 2254 | sub waitmsg_retry { 2255 | my $retry = shift @_; 2256 | 2257 | my $got=0; 2258 | my ($ip, $trace, @msg); 2259 | my $sock=_sendcmd(@_); 2260 | while (!$got) { 2261 | zmq_poll([ 2262 | { 2263 | socket=>$sock, events=>ZMQ_POLLIN, callback=> sub { 2264 | $got=1; 2265 | ($ip, $trace, @msg) = recvmsg($sock); 2266 | }}, 2267 | ],$retry); 2268 | 2269 | if (!$got){ 2270 | $sock=_sendcmd(@_); 2271 | } 2272 | } 2273 | return @msg; 2274 | } 2275 | 2276 | sub sendcmd_nowait { 2277 | my ($host, $port, @cmd) = @_; 2278 | return 0 unless my $sock = getsock($host, $port); 2279 | my $xcmd = packcmd(@cmd); 2280 | 2281 | # 1 millisecond wait 2282 | # zmq_poll([ 2283 | # { 2284 | # socket=>$sock, events=>ZMQ_POLLOUT, callback=> sub {} 2285 | # }, 2286 | # ],1000); 2287 | 2288 | if (zmq_send($sock, "", 0, ZMQ_SNDMORE|ZMQ_NOBLOCK)==0) { 2289 | if (zmq_send($sock, $xcmd)==-1) { 2290 | xlog("error","Can't send [@cmd] to $host:$port : $!", Carp::longmess()); 2291 | return 0; 2292 | } 2293 | } else { 2294 | return 0; 2295 | } 2296 | return 1; 2297 | } 2298 | 2299 | sub recvmsg { 2300 | my ($sock) = @_; 2301 | my @ret; 2302 | my ($buf, $dat); 2303 | if (my $msg = zmq_recvmsg($sock)) { 2304 | $msg = zmq_recvmsg($sock); 2305 | if ($msg) { 2306 | my $buf = zmq_msg_data($msg); 2307 | xlog("debug", "Client $$ got response: $buf") if $conf{trace}; 2308 | if ($buf) { 2309 | return unpackcmd($buf); 2310 | } 2311 | } 2312 | } 2313 | return @ret; 2314 | } 2315 | 2316 | 2317 | sub sendcmd { 2318 | my $sock = _sendcmd(@_); 2319 | return $sock ? 
1 : undef; 2320 | } 2321 | 2322 | sub packcmd { 2323 | if (!$conf{hostip}) { 2324 | confess("Need a defined hostip"); 2325 | } 2326 | return encode_json([$conf{hostip},$conf{trace},@_]); 2327 | } 2328 | 2329 | sub packref { 2330 | if (!(ref($_[0]) eq 'ARRAY')) { 2331 | croak "All packrefs are arrays"; 2332 | } else { 2333 | return encode_json([$conf{hostip},$conf{trace},@{$_[0]}]); 2334 | } 2335 | } 2336 | 2337 | sub unpackcmd { 2338 | my $ref = eval{decode_json($_[0])}; 2339 | if (!$@ && (ref($ref) eq 'ARRAY')) { 2340 | return @$ref; 2341 | } else { 2342 | return undef; 2343 | } 2344 | } 2345 | 2346 | sub packfile { 2347 | croak unless ref $_[0]; 2348 | if (ref $_[0] eq 'HASH') { 2349 | $_[0]->{version}=$VERSION; 2350 | } 2351 | return encode_json($_[0]); 2352 | } 2353 | 2354 | sub unpack_file { 2355 | my $ref; 2356 | eval { 2357 | $ref=decode_json(slurp($_[0])); 2358 | }; 2359 | carp "$@" if $@; 2360 | return $ref; 2361 | } 2362 | 2363 | ### sends a command to a server/router, returns the socket to wait on 2364 | sub getsock { 2365 | my ($host, $port) = @_; 2366 | my $sock; 2367 | if (!$ZMQS{"tcp://$host:$port"}) { 2368 | $sock = zmq_socket($context, ZMQ_DEALER); 2369 | if (!$daemon) { 2370 | # clients should block if messages are queued, not bang on nonexistant servers 2371 | zmq_setsockopt($sock, ZMQ_HWM, 50); 2372 | xlog("debug", "Set HWM to 50 for $host in pid $$\n"); 2373 | } 2374 | if (!zmq_connect($sock,"tcp://$host:$port")) { 2375 | $ZMQS{"tcp://$host:$port"} = $sock; 2376 | } else { 2377 | croak "Can't connect to tcp://$host:$port: $@\n"; 2378 | } 2379 | } else { 2380 | $sock = $ZMQS{"tcp://$host:$port"}; 2381 | } 2382 | 2383 | if (!$sock) { 2384 | xlog("error",$@="Can't connect to $host:$port", Carp::longmess()); 2385 | return undef; 2386 | } 2387 | return $sock; 2388 | } 2389 | 2390 | sub _sendcmd { 2391 | my ($host, $port, @cmd) = @_; 2392 | return undef unless my $sock = getsock($host, $port); 2393 | my $xcmd = packcmd(@cmd); 2394 | zmq_send($sock, "", 0, ZMQ_SNDMORE); 2395 | if (zmq_send($sock, $xcmd)==-1) { 2396 | xlog("error","Can't send [@cmd] to $host:$port : $!", Carp::longmess()); 2397 | } 2398 | return $sock; 2399 | } 2400 | 2401 | sub burp 2402 | { 2403 | my ($f, $dat) = @_; 2404 | my $h = new IO::File; 2405 | do { 2406 | eval { 2407 | open ($h, ">$f.tmp") || die "$f: $!"; 2408 | print $h $dat; 2409 | close $h; 2410 | rename("$f.tmp", $f) || die "$f: $!"; 2411 | }; 2412 | if ($@) { 2413 | xlog("error",$!); 2414 | } 2415 | } while ($@); 2416 | } 2417 | 2418 | sub usage { 2419 | my $u; 2420 | $u .= <<'EOF' unless $make; 2421 | Usage: grun command... 2422 | or: grun -d [] 2423 | or: grun -k [] 2424 | or: grun -e key=val[,key2=val2...] [] 2425 | or: grun -q [] 2426 | 2427 | Lightweight job queueing system 2428 | 2429 | For more help, run grun -?, grun -d -?, grun -C -? or grun -q -?. 
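Examples (illustrative only; the file names and resource sizes below are placeholders):

  grun gzip data.txt
  grun -v -m 2000 -c 4 "sort -k1 data.txt > data.sorted"
  grun -q status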
2430 | EOF
2431 |
2432 | $u .= <<'EOF' unless $daemon || $qinfo || $editjob || $make;
2433 |
2434 | Execution Options:
2435 | -f|ile FILE Read FILE for job options (mem, cpu, cmd, etc)
2436 | -m|em INT memory minimum in MB
2437 | -c|pu CPUS minimum number of cpus
2438 | -host N1,N2 specify certain hosts
2439 | -j|obid TEXT user-supplied job ID
2440 | -v|erbose print job id, execution node and stats to STDERR
2441 | -M|make only run job if inputs are newer than outputs
2442 | -r|priority INT run job with the given priority
2443 |
2444 | -noio disable io-processing, but wait for completion
2445 | -nowait no io and don't wait, just start the command
2446 | -e|rr FILE write stderr directly to FILE, no spool *
2447 | -o|ut FILE write stdout directly to FILE, no spool *
2448 |
2449 | All options can be abbreviated to uniqueness.
2450 |
2451 | * You can append error & output with -oa, -ea or -out-append -err-append,
2452 | or both with -oea, or --out-err-append.
2453 |
2454 | If the command contains shell metacharacters, it's wrapped in a bash script.
2455 | EOF
2456 |
2457 | $u .= <<'EOF' if $make;
2458 | Make Semantics:
2459 |
2460 | %i:file Input file, left in the command line
2461 | #i:file Input file, removed from the command line before executing
2462 | %o:file Output file, left in the command line
2463 | #o:file Output file, removed from the command line before executing
2464 | < file Input file, left in the command line
2465 | > file Output file, left in the command line
2466 | %!>file Output, but don't set as a dependency
2467 |
2468 | For Example:
2469 | grun -M "gzip %i:x.foo #o>x.foo.gz"
2470 |
2471 | If a command fails under Make Semantics, the output file(s) will be
2472 | set to FILE.failed.
2473 |
2474 | NFS sync works better with make semantics.
2475 |
2476 | EOF
2477 |
2478 | $u .= <<'EOF' if $qinfo;
2479 |
2480 | Query Options:
2481 | -a|ll Query all nodes
2482 | -n|odes ($master) List of nodes to query
2483 |
2484 | Query Commands:
2485 | [-]status List nodes (q)
2486 | [-]jobs List jobs (q)
2487 | [-]history List prior jobs (q)
2488 | [-]conf Dump config from memory (q,e)
2489 | EOF
2490 |
2491 | $u .= <<'EOF' if $daemon;
2492 |
2493 | Daemon Options:
2494 | -h|osts (local) One or more hosts
2495 | -r|eload Reload config
2496 | -k|ill Kill running server
2497 | -R|ESTART Kill and restart a running server
2498 |
2499 | Without an option, -d just starts the daemon on the local machine.
2500 | EOF
2501 | $u .= <<'EOF' if $editjob;
2502 |
2503 | Edit Keys:
2504 | hold Hold job (no value needed)
2505 | resume Resume job
2506 | memory=N Memory in MB
2507 | cpus=N # of Cpus needed
2508 | EOF
2509 |
2510 | $u .= <<'EOF' if !$make;
2511 |
2512 | Common Options:
2513 | -C FILE (/etc/grun.conf) Config file location
2514 | -t|race Turn on debugging in the log file
2515 | -V Print version and exit
2516 | -? Show this help page
2517 | EOF
2518 |
2519 | $u .= <<'EOF' if defined($config) && $config eq '';
2520 |
2521 | Configuration File:
2522 |
2523 | All config variables written as {value} are interpreted as perl code, and get evaluated at startup.
2524 |
2525 | The "include" variable simply includes the specified file, as if it were part of the original file.
2526 |
2527 | All non-code configuration variables can include '$varname', which gets expanded to the value of another config var.
2528 |
2529 | Be careful with match code. If it's slow, it will kill the performance of your main node.
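For illustration, a minimal config for a host that runs both the queue and an exec service might look like this (the hostname and values are examples; adjust for your site):

  master: headnode:5184
  services: queue,exec
  spool: /var/spool/grun
  log_file: /var/log/grun.log
  env: PATH,HOME
  ping_secs: 30
  default_memory: 500
  default_priority: 20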
2530 | 
2531 | Common variables:
2532 | 
2533 | master (localhost) Hostname[:port] of master node
2534 | spool (/var/spool/grun) Location for queue & io temp storage
2535 | log_file Location of the log
2536 | services Must be 'queue' and/or 'exec'
2537 | port Port to listen on (5184)
2538 | bind[:port] Address to bind to (0.0.0.0)
2539 | trace Turn tracing on for the whole server
2540 | 
2541 | Queue config vars:
2542 | 
2543 | env (PATH) List of environment variables to copy to the processes. An asterisk (*) means 'ALL'
2544 | expire_secs (0) If set, jobs that aren't pinged within this time are failed or retried (see expire_action)
2545 | expire_action (retry) Can be 'retry', 'fail'
2546 | idle_load (.3) If load is less than this amount, the node is considered idle
2547 | io_keep (3600) Time to keep unretrieved stdio files (0=forever)
2548 | log_file Where to send "xlog" output
2549 | pid_file (/var/run/grun.pid)
2550 | ping_secs (30) Nodes ping the master this often.
2551 | ping_expire (2*$ping_secs) Drop a node if it doesn't ping in time
2552 | 
2553 | CLI vars & defaults:
2554 | 
2555 | nfs_sync (1) Whether to force-sync the directory cache after a job is run
2556 | default_cpu (1) Default cpu reservation
2557 | default_memory (1m) Default memory for jobs
2558 | default_priority (20) Default priority for jobs
2559 | 
2560 | Execution node config vars:
2561 | 
2562 | match Perl code that must eval to TRUE for a node match
2563 | full_match (1) If the job queue is full, this is evaluated
2564 | full_exec If full_match returns true, then this command is run
2565 | wrap Job command wrapper
2566 | 
2567 | EOF
2568 | return $u;
2569 | }
2570 | 
2571 | sub showconf {
2572 | return pretty_encode(\%conf);
2573 | }
2574 | 
2575 | sub showhist {
2576 | my %opt;
2577 | 
2578 | $opt{fmt} = '%jid\t%user\t%stat\t%cwd\t%cmd\t%host\t%mtime\n';
2579 | 
2580 | {
2581 | local @ARGV = @_;
2582 | GetOptions(\%opt, "count|c=i", "user=s","job=i@","resubmit","fmt|F=s","long","grep|g=s","dump");
2583 | @_=@ARGV;
2584 | }
2585 | 
2586 | if ($_[0] =~ /[a-z]/ && !$opt{user}) {
2587 | $opt{user} = $_[0]
2588 | } else {
2589 | while ($_[0] =~ /^\d+$/) {
2590 | push @{$opt{job}}, $_[0];
2591 | shift;
2592 | }
2593 | }
2594 | 
2595 | my $r; # the result
2596 | 
2597 | my $count = $opt{count};
2598 | $count = 10 if !$count;
2599 | my @J;
2600 | if ($opt{job}) {
2601 | for (@{$opt{job}}) {
2602 | my $f = (jhistpath($_));
2603 | if ( -e $f ) {
2604 | my $t=unpack_file($f);
2605 | my ($jid) = $f =~ /\/(.+)$/;
2606 | $t->{jid}=$jid;
2607 | my $mtime = (stat($f))[9];
2608 | $t->{mtime}=$mtime;
2609 | push @J, $t;
2610 | } else {
2611 | xlog("error", "History for job $_ requested, but file not found\n");
2612 | }
2613 | }
2614 | } else {
2615 | my $k = 5;
2616 | my @mx = ((-1) x $k);
2617 | opendir(D,"$conf{spool}/jhist");
2618 | while(defined ($_=readdir(D))) {
2619 | next unless /^\d+$/;
2620 | for (my $i = 0; $i < $k; ++$i) {
2621 | if ($_ > $mx[$i]) {
2622 | # shift everything up
2623 | my $top=$i++;
2624 | for (; $i < $k; ++$i) {
2625 | $mx[$i]=$mx[$i-1];
2626 | }
2627 | $mx[$top]=$_;
2628 | last;
2629 | }
2630 | }
2631 | }
2632 | closedir(D);
2633 | 
2634 | for (my $i = 0; $i < $k; ++$i) {
2635 | if (@J < $count) {
2636 | opendir(D,"$conf{spool}/jhist/$mx[$i]");
2637 | my @T = readdir(D);
2638 | @T = sort {$b <=> $a} @T;
2639 | closedir(D);
2640 | # prepend dir
2641 | for my $jid (@T) {
2642 | next unless $jid=~/^\d/;
2643 | my $f = "$conf{spool}/jhist/$mx[$i]/$jid";
2644 | my $job=eval{unpack_file($f)};
2645 | next unless ref($job);
2646 | 
2647 | # support array/text
formats 2648 | my @cmd=ref $job->{cmd} ? @{$job->{cmd}} : ($job->{cmd}); 2649 | 2650 | # user supplied filter 2651 | next if $opt{user} && ! ($job->{user} eq $opt{user}); 2652 | next if $opt{grep} && "$job->{user}\t@cmd\t$job->{cwd}" !~ $opt{grep}; 2653 | 2654 | my $mtime = (stat($f))[9]; 2655 | 2656 | next unless $mtime; 2657 | 2658 | $job->{jid}=$jid; 2659 | $job->{mtime}=$mtime; 2660 | 2661 | push @J, $job; 2662 | last if @J >= $count; 2663 | } 2664 | } 2665 | } 2666 | } 2667 | 2668 | for my $job (@J) { 2669 | my $jid=$job->{jid}; 2670 | my $mtime = $job->{mtime}; 2671 | 2672 | my @cmd=ref $job->{cmd} ? @{$job->{cmd}} : ($job->{cmd}); 2673 | next if $opt{user} && ! ($job->{user} eq $opt{user}); 2674 | next if $opt{grep} && "$job->{user}\t@cmd\t$job->{cwd}" !~ $opt{grep}; 2675 | --$count; 2676 | my %job=%{$job}; 2677 | 2678 | $job{status} = $job->{usage}->{status} if $job->{usage} && ! $job->{status}; 2679 | $job{status} = ($job{status} >> 8) if 0 == ($job{status} & 0xFF); 2680 | # standard 2681 | $job{status} = 'OK' if $job{status} eq 0; 2682 | $job{status} = 'KILLED' if $job{status} eq 199; 2683 | $job{status} = 'INT' if $job{status} eq 2; 2684 | $job{status} = 'ORPHAN' if $job{status} eq $STATUS_ORPHAN; 2685 | $job{status} = 'SIGSEGV' if $job{status} eq 11; 2686 | $job{status} = 'SIGPIPE' if $job{status} eq 13; 2687 | $job{status} = 'SIGFPE' if $job{status} eq 8; 2688 | 2689 | # linux specific 2690 | $job{status} = 'NOTFOUND' if $job{status} eq 127; 2691 | $job{status} = 'ASSERT' if $job{status} eq 134; 2692 | $job{status} = 'OUTOFMEM' if $job{status} eq 137; 2693 | $job{status} = 'ALTSEGV' if $job{status} eq 139; 2694 | $job{stat} = $job{status}; 2695 | 2696 | $job{wait} = $job{usage}->{start_time}-$job{time}; 2697 | 2698 | my $cmd = join(' ', @cmd); 2699 | $cmd =~ s/\n\s+/\n/g; 2700 | $cmd =~ s/^\s+//; 2701 | $cmd =~ s/\s+$//; 2702 | $cmd =~ s/\n/;/g; 2703 | if ($nodes{$job{host}} && $nodes{$job{host}}->{hostname}) { 2704 | $job{host}=$nodes{$job{host}}->{hostname} 2705 | } elsif (-e (my $n="$conf{spool}/nodes/$job{host}.reg")) { 2706 | my $node=unpack_file($n); 2707 | $job{host}=$node->{hostname}; 2708 | } 2709 | $job{jid}=$jid; 2710 | $job{cmd}=$cmd; 2711 | # $job{env}=join ':' . map { $_.'='.$job{env}{$_} } keys %{$job{env}}; 2712 | $job{mtime}=fmtime($mtime); 2713 | if (ref($job{usage})) { 2714 | for (keys(%{$job{usage}})) { 2715 | $job{"usage_".$_}=$job{usage}{$_}; 2716 | } 2717 | delete $job{usage}; 2718 | } 2719 | if ($opt{long}) { 2720 | $r .= "----------\n"; 2721 | for(sort(keys(%job))) { 2722 | $r .= "$_=" . (ref($job{$_})?packdump($job{$_}):$job{$_}) . "\n"; 2723 | } 2724 | } elsif ($opt{dump}) { 2725 | $r .= packdump($job) . 
"\n"; 2726 | } else { 2727 | $r .= fmtstr($opt{fmt}, \%job); 2728 | } 2729 | 2730 | last if $count == 0; 2731 | } 2732 | return $r; 2733 | } 2734 | 2735 | sub fmtime { 2736 | my ($t) = @_; 2737 | return strftime("%m/%d %H:%M",localtime($t)); 2738 | } 2739 | 2740 | # grun -q status 2741 | sub shownodes { 2742 | my $r; 2743 | $r .= sprintf "%-15s %-14s %-8s Bench\n", 'Hostname','Memory', 'Cpu'; 2744 | my @nodes = getnodes(); 2745 | for my $node (sort {$a->{hostname} cmp $b->{hostname}} @nodes) { 2746 | if ($node->{ping} > time() - ($conf{ping_secs} * 2) ) { 2747 | chomp($node->{hostname}); 2748 | $node->{hostname} = substr($node->{hostname},0,15); 2749 | 2750 | my $cpus = $node->{cpus} - $node->{a_cpus}; 2751 | $cpus = $node->{avail} if $node->{avail} < $cpus; 2752 | 2753 | my $mem = $node->{tmem} - $node->{a_memory}; 2754 | $mem = $node->{mem} if $node->{tmem} > 0 && $node->{mem} < $mem; 2755 | 2756 | $r .= sprintf "%-15s %6dm/%-6d %4.1f/%-2d %3d\n", $node->{hostname}, $node->{mem}/1000, $node->{tmem}/1000,$node->{avail},$node->{cpus},$node->{bench}; 2757 | } 2758 | } 2759 | return $r; 2760 | } 2761 | 2762 | # grun -q jobs 2763 | sub showjobs { 2764 | my %opt; 2765 | 2766 | $opt{fmt} = '%s:jid\t%s:user\t%s:stat\t%s:cwd\t%s:cmd\n'; 2767 | { 2768 | local @ARGV = @_; 2769 | GetOptions(\%opt, "dump|d", "user=s","job=i","fmt|F=s","long","debug"); 2770 | @_=@ARGV; 2771 | } 2772 | 2773 | if ($_[0]) { 2774 | my $user = shift @_; 2775 | if ($user =~ /^\d+$/) { 2776 | $opt{job}=$user; 2777 | } else { 2778 | $opt{user}=$user if !($user eq ''); 2779 | } 2780 | } 2781 | 2782 | xlog("debug", "Show job for user:$opt{user}, job:$opt{job}\n"); 2783 | 2784 | my $r; 2785 | # $r .= sprintf "%s\t%s\t%s\t%s\t%s\n", 'JobID','User','Host','Cwd','Command'; 2786 | 2787 | my $now = time(); 2788 | opendir(D,"$conf{spool}/jobs"); 2789 | while(my $jid=readdir(D)) { 2790 | next if $jid =~ /\.ip$/; 2791 | next if $jid =~ /\.ok$/; 2792 | my $f = "$conf{spool}/jobs/$jid"; 2793 | next unless -f $f; 2794 | my $job = unpack_file($f); 2795 | if (ref($job)) { 2796 | my %job=%{$job}; 2797 | next if $opt{user} && ! ($opt{user} eq $job{user}); 2798 | next if $opt{job} && ! ($jid =~ /^$opt{job}\b/); 2799 | my $stat = '(I)'; 2800 | $stat = '(H)' if $job->{state} eq 'hold'; 2801 | if ($jid =~ s/:([.\d]+?)\.run$//) { 2802 | my $ip = $1; 2803 | my $node = unpack_file("$conf{spool}/nodes/$ip.reg"); 2804 | $stat = $node->{hostname}; chomp $stat; 2805 | $job->{host} = $stat; 2806 | # ip file is created at start 2807 | # run file is updated regularly to prevent orphans 2808 | if ( -e "$conf{spool}/jobs/$jid.ip" ) { 2809 | $job->{start} = (stat("$conf{spool}/jobs/$jid.ip"))[9]; 2810 | } 2811 | $stat .= ' (S)' if $job->{state} eq 'susp'; 2812 | } else { 2813 | $stat = '(S)' if $job->{state} eq 'susp'; 2814 | } 2815 | # trim for display 2816 | my @cmd = @{$job->{cmd}}; 2817 | for (@cmd) { 2818 | s/^\s+//g; 2819 | s/\n\s*/;/g; 2820 | } 2821 | $job->{wait} = ($job->{start}?$job->{start}:$now)-$job->{time}; 2822 | $job->{jid} = $jid; 2823 | $job->{stat} = $stat; 2824 | $job->{cpus} = 1 if ! $job->{cpus}; 2825 | $job->{memory} = $conf{default_memory} if ! $job->{memory}; 2826 | $job->{priority} = $conf{default_priority} if ! $job->{priority}; 2827 | $job->{cmd} = join(' ', @cmd); 2828 | if ($opt{long}) { 2829 | $r .= "----------\n"; 2830 | for(sort(keys(%$job))) { 2831 | $r .= "$_=" . (ref($job->{$_})?packdump($job->{$_}):$job->{$_}) . "\n"; 2832 | } 2833 | } elsif ($opt{dump}) { 2834 | $r .= packdump($job) . 
"\n"; 2835 | } else { 2836 | $r .= fmtstr($opt{fmt}, $job); 2837 | } 2838 | } 2839 | } 2840 | closedir(D); 2841 | return $r; 2842 | } 2843 | 2844 | sub fmtstr { 2845 | my ($fmt, $hash) = @_; 2846 | 2847 | # get list of fields 2848 | my @fds = $fmt =~ m/%(?:[#0 +,I:-]*(?:\d+)?(?:\.\d+)?\w{1,2}:)?([\w-]+|(?:{[^{}]+}))/g; 2849 | 2850 | %{$safe->varglob("i")}=%{$hash}; 2851 | my @vals; 2852 | for (@fds) { 2853 | if ($_ =~ /^\{(.+)\}/) { 2854 | push @vals, $safe->reval($1); 2855 | } else { 2856 | if (ref($hash->{$_}) eq 'HASH') { 2857 | push @vals, packdump($hash->{$_}); 2858 | } else { 2859 | push @vals, $hash->{$_}; 2860 | } 2861 | } 2862 | } 2863 | undef %{$safe->varglob("i")}; 2864 | 2865 | # replace formatted - with format-only 2866 | $fmt =~ s/(%[#0 +,I:-]*(?:\d+)?(?:\.\d+)?\w{1,2}):([\w-]+|({[^{}]+}))/$1/g; 2867 | 2868 | my $fds=join '|', map quotemeta($_), @fds; 2869 | # replace pure-fields with field-only 2870 | $fmt =~ s/%($fds)/%s/g; 2871 | # expand vars 2872 | 2873 | $fmt =~ s/\\n/\n/g; 2874 | $fmt =~ s/\\t/\t/g; 2875 | $fmt =~ s/\\r/\r/g; 2876 | 2877 | # format the rest 2878 | return sprintf($fmt, @vals); 2879 | } 2880 | 2881 | sub bestunique { 2882 | my ($c, @c) = @_; 2883 | my $b; 2884 | for (@c) { 2885 | if ($_ =~ /^$c/) { 2886 | return undef if ($b); 2887 | $b = $_; 2888 | } 2889 | } 2890 | return $b; 2891 | } 2892 | 2893 | # returns an array of [host,port], given a list of host[:port] names 2894 | sub expandnodes { 2895 | my @r; 2896 | my @n; 2897 | for (split(/[\s,]/, join ' ', @_)) { 2898 | my ($h, $p) = m/^(.*)(:\d+)?$/; 2899 | $p = $conf{port} if !$p; 2900 | if ($h =~ s/\*/\.\*/g) { 2901 | if (!@n) { 2902 | @n=getnodes(); 2903 | } 2904 | for (@n) { 2905 | push @r, [$_->{hostname}, $p] if $_->{hostname} =~ /$h/; 2906 | } 2907 | } else { 2908 | push @r, [$h, $p]; 2909 | } 2910 | } 2911 | return @r; 2912 | } 2913 | 2914 | sub evalctx { 2915 | my ($expr, @ctx) = @_; 2916 | my $msafe = new Safe; 2917 | $msafe->permit(qw(:subprocess :filesys_read)); # allow backticks 2918 | for(my $i = 0; $i < @ctx; $i+=2) { 2919 | my ($name, $var) = ($ctx[$i], $ctx[$i+1]); 2920 | # references to hashes/arrays become dereferenced hashes/arrays 2921 | if (ref($var) eq 'HASH') { 2922 | %{$msafe->varglob($name)}=%{$var}; 2923 | } elsif (ref($var) eq 'ARRAY') { 2924 | @{$msafe->varglob($name)}=@{$var}; 2925 | } else { 2926 | ${$msafe->varglob($name)}=$var; 2927 | } 2928 | } 2929 | $msafe->share(qw(%conf %ENV)); 2930 | # if ($conf{trace}) { 2931 | # xlog("debug", "Evaluating {$expr}\n"); 2932 | # } 2933 | my ($res, @res); 2934 | if (wantarray) { 2935 | @res = $msafe->reval($expr); 2936 | } else { 2937 | $res = $msafe->reval($expr); 2938 | } 2939 | my $save = $@; 2940 | if ($@) { 2941 | xlog("error", "Error evaluating {$expr} : $@\n"); 2942 | return undef; 2943 | } 2944 | $@=$save; 2945 | if (wantarray) { 2946 | return @res; 2947 | } else { 2948 | return $res; 2949 | } 2950 | } 2951 | 2952 | sub kicknodes { 2953 | my $did=0; 2954 | my @nodes = getnodes(cached=>1); 2955 | return if !$conf{kickstart}; 2956 | return if (time() < ($conf{lastkicktime} + $conf{ping_secs} * 4)); 2957 | $conf{lastkicktime} = time(); 2958 | for my $node (@nodes) { 2959 | # not just started 2960 | if ($conf{loop_num} > 5) { 2961 | # sometime in the last 24 hours? 
2962 | if ($node->{ping} > (time() - ($conf{remove_secs}))) { 2963 | # but not in the last couple minutes 2964 | if ($node->{ping} < (time() - ($conf{ping_secs} * 4)) ) { 2965 | # kick it 2966 | $did=1; 2967 | xlog("note", "Kicking node $node->{hostname}\n"); 2968 | if (!fork) { 2969 | eval { 2970 | zmq_fork_undef(); 2971 | # be sure this is gone 2972 | my $cmd = $conf{kickstart}; 2973 | if ($cmd =~ /^\{(.*)\}$/) { 2974 | $cmd = evalctx($1, node=>$node); 2975 | } 2976 | if ($cmd && $cmd !~ /\$/) { 2977 | exec("$cmd"); 2978 | } 2979 | }; 2980 | exit(0); 2981 | } 2982 | } 2983 | } 2984 | } 2985 | } 2986 | return $did; 2987 | } 2988 | 2989 | sub getnodes { 2990 | my (%opt) = @_; 2991 | my @r; 2992 | if (!$opt{cached} && %nodes) { 2993 | # return memcached values 2994 | return values %nodes; 2995 | } 2996 | # read all from disk, including old ones 2997 | opendir(D,"$conf{spool}/nodes"); 2998 | while($_=readdir(D)) { 2999 | $_ = "$conf{spool}/nodes/$_"; 3000 | next unless -f $_; 3001 | my $node=eval{unpack_file($_)}; 3002 | if ($node) { 3003 | if (! defined $node->{avail}) { 3004 | $node->{avail} = $node->{cpus}-$node->{load}; 3005 | } 3006 | push @r, $node; 3007 | } 3008 | } 3009 | closedir D; 3010 | return @r; 3011 | } 3012 | 3013 | sub startdaemon { 3014 | my ($dkill, $restart, $reload, $node, $all, $dobench, $nofork); 3015 | 3016 | GetOptions("kill"=>\$dkill, "restart|R"=>\$restart, "benchmark|B"=>\$dobench, "reload|r"=>\$reload, "host=s"=>\$node, "all"=>\$all, "nofork|F"=>\$nofork) || 3017 | die usage(); 3018 | 3019 | $node = '*' if $all; 3020 | 3021 | if ($reload || $node) { 3022 | my @n = $node ? expandnodes($node) : ([$conf{master},$conf{master_port}]); 3023 | 3024 | my $stat = 0; 3025 | my $xcmd = $reload ? 'relo' : $restart ? 'rest' : $dkill ? 'term' : ''; 3026 | if (!$xcmd) { 3027 | die usage(); 3028 | } 3029 | for (@n) { 3030 | my ($res) = waitmsg($_->[0], $_->[1], "xcmd", $xcmd); 3031 | if (! defined $res) { 3032 | print "$@\n"; 3033 | $stat = 1; 3034 | } else { 3035 | print "$res\n"; 3036 | } 3037 | } 3038 | exit $stat; 3039 | } 3040 | 3041 | if ($dobench) { 3042 | print STDERR "Bench\t" . int(getbench(1)) . "\n"; 3043 | exit 0; 3044 | } 3045 | 3046 | if ($restart) { 3047 | $daemon = 1; 3048 | } 3049 | 3050 | killgpid() if $dkill || $restart; 3051 | exit 0 if $dkill; 3052 | sleep 1 if $restart; 3053 | 3054 | # start daemon 3055 | if ($gpid) { # already running? 3056 | if (kill(0, $gpid)) { 3057 | die "Already running ($gpid), not starting twice\n"; 3058 | } 3059 | } 3060 | xlog("note", "Starting daemon as " . getpwuid($<)); 3061 | 3062 | my $limit = `bash -c "ulimit -v -f -t -m -s"`; 3063 | 3064 | if (@{[($limit =~ m/unlimited$/mg)]}!=5) { 3065 | my @n=qw(virt file time mem stack); 3066 | my @v=$limit =~ m/(\w+)$/mg; 3067 | my %d; 3068 | map {$d{$n[$_]}=$v[$_] if ! ($v[$_] eq 'unlimited')} (0..4); 3069 | warn "Note: ulimit is not unlimited for daemon (" . join(' ',%d) . 
")\n"; 3070 | } 3071 | 3072 | if ($conf{ulimit}) { 3073 | if (eval {require BSD::Resource;}) { 3074 | local @ARGV = split / /, $conf{ulimit}; 3075 | Getopt::Long::Configure qw(no_require_order no_ignore_case); 3076 | my %opt; 3077 | GetOptions(\%opt,"f=i","v=i","m=i","t=i"); 3078 | if (@ARGV) { 3079 | die "Options @ARGV not supported by grun, use a regular ulimit wrapper"; 3080 | } 3081 | no strict "subs"; 3082 | BSD::Resource::setrlimit(BSD::Resource::RLIMIT_FSIZE(),$opt{f},$opt{f}) if ($opt{f}); 3083 | BSD::Resource::setrlimit(BSD::Resource::RLIMIT_VMEM(),$opt{v},$opt{v}) if ($opt{v}); 3084 | BSD::Resource::setrlimit(BSD::Resource::RLIMIT_RSS(),$opt{m},$opt{m}) if ($opt{m}); 3085 | BSD::Resource::setrlimit(BSD::Resource::RLIMIT_CPU(),$opt{t},$opt{t}) if ($opt{t}); 3086 | } else { 3087 | die "Please install BSD::Resource to use the 'ulimit' option in $def{config}\n"; 3088 | } 3089 | } 3090 | 3091 | zmq_safe_term(); 3092 | 3093 | if ($nofork || (!($gpid=fork))) { 3094 | zmq_fork_undef() if !$nofork; 3095 | 3096 | $log_to_stderr = 1 if $nofork; 3097 | $gpid = $$ if $nofork; 3098 | die "Can't fork child process\n" if (! defined $gpid); 3099 | 3100 | $context=zmq_init(); 3101 | open (P, ">$conf{pid_file}") || die "Can't open pidfile '$conf{pid_file}' : $!\n"; 3102 | print P $$; 3103 | close P; # save pid 3104 | 3105 | if (!$nofork) { 3106 | open STDIN, '/dev/null'; 3109 | open STDERR, '>&STDOUT'; 3110 | } 3111 | } 3112 | 3113 | POSIX::setsid(); 3114 | 3115 | $SIG{INT} = sub { $quit = 1; }; 3116 | $SIG{TERM} = $SIG{INT}; 3117 | $SIG{HUP} = \&init; 3118 | 3119 | $router = zmq_socket($context, ZMQ_ROUTER); 3120 | 3121 | zmq_bind($router, "tcp://$conf{bind}:$conf{port}") 3122 | and die "Can't make ZMQ router on port $conf{port}: $!"; 3123 | 3124 | $quit = 0; 3125 | $conf{loop_num} = 0; 3126 | while (!$quit) { 3127 | my $did=0; 3128 | my $start = Time::HiRes::time(); 3129 | ++$conf{loop_num}; 3130 | eval {$did|=readsocks()}; 3131 | last if $quit; 3132 | xlog("error", "Daemon $$ readsocks exception: $@") if $@; 3133 | 3134 | # theoretically these could be in separate threads, or forked off 3135 | if ( $conf{services}->{queue} ) { 3136 | eval {$did|=schedule()}; 3137 | xlog("error", "Daemon $$ schedule exception: $@") if $@; 3138 | } 3139 | 3140 | if ( $conf{services}->{exec} ) { 3141 | eval {$did|=srvexec()}; 3142 | xlog("error", "Daemon $$ srvexec exception: $@") if $@; 3143 | }; 3144 | 3145 | if (!$did) { 3146 | my $elapsed = Time::HiRes::time()-$start; 3147 | if ($elapsed < .25) { 3148 | # fractional sleep 3149 | # xlog("debug", "Did nothing, fractional sleep for " . (.25-$elapsed)); 3150 | select(undef, undef, undef, .25-$elapsed); 3151 | } 3152 | } 3153 | # print "HERE\n"; 3154 | } 3155 | xlog("note", "Shutdown"); 3156 | zmq_unbind($router, "tcp://$conf{bind}:$conf{port}"); 3157 | sleep(1); 3158 | eval {readsocks()}; 3159 | zmq_safe_term(); 3160 | unlink $conf{pid_file}; 3161 | } 3162 | exit 0; 3163 | } 3164 | 3165 | sub END { 3166 | zmq_safe_term(); 3167 | } 3168 | 3169 | sub zmq_safe_term() { 3170 | # if i'm the parent pid 3171 | if ($router) { 3172 | zmq_setsockopt($router, ZMQ_LINGER, 1); 3173 | zmq_unbind($router,"tcp://$conf{bind}:$conf{port}"); 3174 | zmq_close($router); 3175 | $router=undef; 3176 | } 3177 | # these are all the execution nodes... 
if any 3178 | for (values(%ZMQS)) { 3179 | zmq_setsockopt($_, ZMQ_LINGER, 1); 3180 | zmq_close($_); 3181 | } 3182 | %ZMQS=(); 3183 | if ($context) { 3184 | zmq_term($context); 3185 | $context=undef; 3186 | } 3187 | } 3188 | 3189 | sub zmq_fork_undef() { 3190 | # close all duped file descriptors 3191 | opendir(D,"/proc/$$/fd"); 3192 | while(my $fd = readdir(D)) { 3193 | if ($fd > 2 && (POSIX::lseek($fd, 0, POSIX::SEEK_CUR) == -1)) { 3194 | if (POSIX::ESPIPE == POSIX::errno()) { 3195 | POSIX::close($fd) if $fd > 2; 3196 | } 3197 | } 3198 | } 3199 | closedir(D); 3200 | # kill access to any forked sockets 3201 | $router=undef; 3202 | %ZMQS=(); 3203 | $context=undef; 3204 | } 3205 | 3206 | 3207 | sub killgpid { 3208 | die "Can't find pid $conf{pid_file} for daemon\n" if !$gpid; 3209 | if (!kill(2, $gpid)) { 3210 | die "Can't kill -INT $gpid: $!\n"; 3211 | } 3212 | sleep 1; 3213 | $gpid = 0; 3214 | } 3215 | 3216 | sub samehost { 3217 | my($h1, $h2) = @_; 3218 | $h1 =~ s/\s+$//; 3219 | $h2 =~ s/\s+$//; 3220 | $h1=host2ip($h1); 3221 | $h2=host2ip($h2); 3222 | # localhost = dig `hostname` 3223 | $h1 =~ s/^(0\.0\.0\.0|127\.0\.0\.1)$/$conf{hostip}/; 3224 | $h2 =~ s/^(0\.0\.0\.0|127\.0\.0\.1)$/$conf{hostip}/; 3225 | return $h1 eq $h2; 3226 | } 3227 | 3228 | sub raw_host2ip { 3229 | my (@octets, $raw_addr, $ip); 3230 | return $_[0] if $_[0] =~ /^(\d+\.){3}\d+$/; 3231 | $raw_addr = (gethostbyname($_[0]))[4]; 3232 | @octets = unpack("C4", $raw_addr); 3233 | $ip = join(".", @octets); 3234 | return($ip); 3235 | } 3236 | 3237 | sub host2ip { 3238 | my ($test_addr, $tmpstr, $mDNSdomain); 3239 | return $_[0] if $_[0] =~ /^(\d+\.){3}\d+$/; 3240 | $test_addr = raw_host2ip($_[0]); 3241 | if (!$test_addr) { 3242 | $mDNSdomain = '.local'; 3243 | if (length $_[0] > length $mDNSdomain) { 3244 | $tmpstr = substr($_[0], -(length $mDNSdomain)); 3245 | if (!($tmpstr eq $mDNSdomain)) { 3246 | $test_addr = raw_host2ip($_[0] . $mDNSdomain); 3247 | } 3248 | } else { 3249 | $test_addr = raw_host2ip($_[0] . $mDNSdomain); 3250 | } 3251 | } 3252 | 3253 | return ($test_addr); 3254 | } 3255 | 3256 | sub packdump { 3257 | return encode_json($_[0]); 3258 | } 3259 | 3260 | sub kill_job { 3261 | my %op = @_; 3262 | my ($err) = waitmsg($conf{master}, $conf{master_port}, 'jkill', \%op); 3263 | if ($err =~ /Forward (\d+):?\s*([\d.]+):?(\d*)/i) { 3264 | my ($jid, $ip, $port) = ($1, $2, $3); 3265 | $port = $conf{port} if !$port; 3266 | ($err) = waitmsg($ip, $port, 'xabort', $jid, $op{sig}, $op{termio}); 3267 | } 3268 | return $err; 3269 | } 3270 | 3271 | sub proper { 3272 | my $x = shift; 3273 | $x=~ s/\b(\S)/uc($1)/eg; 3274 | return $x; 3275 | } 3276 | 3277 | sub execute_job { 3278 | my ($opts) = @_; 3279 | my @cmd = @{$opts->{cmd}}; 3280 | 3281 | my ($uid, $gid, $err, $ret); 3282 | if ($opts->{user}) { 3283 | (undef, undef, $uid, $gid) = getpwnam($opts->{user}); 3284 | } 3285 | if (defined($uid) && !$conf{run_asroot} && !$uid) { 3286 | $err="Won't run as root"; 3287 | } 3288 | if (!defined($uid)) { 3289 | $err="User $opts->{user} is unknown on this machine, not executing"; 3290 | } 3291 | if ($opts->{group}) { 3292 | $gid=$opts->{group}; 3293 | } 3294 | # expecting someone to come pick up output? 
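# (If the submitter asked to wait or to stream i/o, remember that fact in
# %io_wait, keyed by job id, so the status and stdio can be handed back
# when the client comes to collect them.)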
3295 | xlog("Creating io_wait for $opts->{id}") if $opts->{wait} || $opts->{io}; 3296 | $io_wait{$opts->{id}}->{type} = 'stat' if $opts->{wait}; # status wait 3297 | $io_wait{$opts->{id}}->{type} = 'io' if $opts->{io}; # io wait 3298 | $io_wait{$opts->{id}}->{time} = time() if $opts->{wait} || $opts->{io}; # io wait 3299 | 3300 | my $pid; 3301 | 3302 | if ($conf{wrap}) { 3303 | @cmd = ($conf{wrap}, @cmd); 3304 | } 3305 | 3306 | my $pfile = "$conf{spool}/jpids/$opts->{id}"; 3307 | 3308 | if (-e $pfile) { 3309 | xlog($err="Job file $pfile already exists, not executing."); 3310 | } else { 3311 | if (!open(PF, ">$pfile")) { 3312 | xlog($err="Can't create $pfile: $!"); 3313 | } 3314 | } 3315 | 3316 | my $bfile = "$conf{spool}/jstat/$opts->{id}"; 3317 | my $tfile = "$bfile.stat"; 3318 | my $ifile = "$bfile.job"; 3319 | 3320 | $opts->{uid}=$uid; 3321 | $opts->{gid}=$gid; 3322 | 3323 | # job info, saved 3324 | burp($ifile, packfile($opts)); 3325 | chown($uid,0,$ifile); 3326 | 3327 | if (!open(XOUT, ">", "$tfile")) { 3328 | $err=$!; 3329 | $ret=$?; 3330 | } 3331 | chown($uid,0,$tfile); 3332 | # leave the file open for fork 3333 | 3334 | if (!$err && !($pid=fork)) { 3335 | if (! defined $pid) { 3336 | $err = "Can't fork"; 3337 | } else { 3338 | $0="GRUN:$opts->{id}"; 3339 | zmq_fork_undef(); 3340 | 3341 | # restore signal to default ... regular kill 3342 | $SIG{INT} = undef; 3343 | $SIG{TERM} = undef; 3344 | $ENV{USER} = $opts->{user}; 3345 | 3346 | # kill me with a negative number, all kids die too 3347 | # kill my parent.... i stay alive, and my IO is still ready to go 3348 | my $pgid=POSIX::setsid(); 3349 | xlog("debug", "PGID set to $pgid\n"); 3350 | 3351 | # copy in the umask & the environment 3352 | umask $opts->{umask}; 3353 | for (keys(%{$opts->{env}})) { 3354 | $ENV{$_}=$opts->{env}->{$_}; 3355 | } 3356 | 3357 | for (keys(%{$opts})) { 3358 | next if ref($opts->{$_}); 3359 | $ENV{"_GRUN_OPT_$_"} = $opts->{$_}; 3360 | } 3361 | $ENV{"_GRUN"} = $opts->{id}; 3362 | $ENV{"SGE_TASK_ID"} = $opts->{id} if (!$ENV{SGE_TASK_ID}); 3363 | 3364 | my ($err, $ret); 3365 | 3366 | my $shfile = "$bfile.sh"; 3367 | if (!open(SHFILE, ">", $shfile)) { 3368 | $err=$!; 3369 | $ret=$?; 3370 | } 3371 | chown($uid,0,$shfile); 3372 | 3373 | my $hard_factor = defined($opts->{hard_factor}) ? $opts->{hard_factor} : $conf{hard_factor}; 3374 | if ($hard_factor && $opts->{memory}) {eval{ 3375 | # add 4mb for o/s stuff to load 3376 | print SHFILE "ulimit -v " . (4000+int(($opts->{memory} * $conf{hard_factor}))) . "\n" 3377 | if $opts->{memory}; 3378 | }} 3379 | 3380 | if ($cmd[0] !~ / /) { 3381 | for (@cmd) { 3382 | if ($_ =~ / /) { 3383 | $_ =~ s/"/\\"/g; 3384 | $_ = '"' . $_ . '"'; 3385 | } 3386 | } 3387 | } 3388 | 3389 | print SHFILE join(" ", @cmd), "\n"; 3390 | close SHFILE; 3391 | 3392 | xlog("debug", "Wrote $shfile\n") if $opts->{trace}; 3393 | 3394 | if (!$err) { 3395 | xlog("debug", "Setting uid to $uid, gid to $gid.\n") if $opts->{trace}; 3396 | eval { 3397 | if ($gid) { 3398 | $) = $gid; 3399 | $( = $gid; 3400 | } 3401 | $> = $uid; 3402 | $< = $uid; 3403 | if ($opts->{cwd}) { 3404 | if (!chdir($opts->{cwd})) { 3405 | $err = "Can't cd to $opts->{cwd} : $!"; 3406 | $ret = 103; 3407 | } 3408 | } 3409 | }; 3410 | } 3411 | 3412 | if (!$err && $@) { 3413 | $ret = 102; 3414 | $err = "Error setting uid to $uid: $@\n"; 3415 | } 3416 | 3417 | xlog("debug", "About to launch (@cmd)\n") if $opts->{trace}; 3418 | 3419 | if (!$err) { 3420 | my $confarg = "-C $conf{config}"; # same config! 3421 | # this frees all ram... 
yay, finally 3422 | exec("$GRUN_PATH $confarg -Y $bfile"); 3423 | $err = "Can't exec: $!"; 3424 | $pid=-1; 3425 | } else { 3426 | eval { 3427 | # save output 3428 | 3429 | # immediate reply if possible 3430 | xlog("debug", "Error before launch: $ret \"$err\" (@cmd)\n") if $opts->{trace}; 3431 | my $out = { 3432 | id=>$opts->{id}, 3433 | status=>$ret, 3434 | error=>$err, 3435 | dumped=>1, 3436 | }; 3437 | print XOUT packfile($out); 3438 | close XOUT; 3439 | $context=zmq_init(); 3440 | send_status_for_job($out); 3441 | printf STDERR "$err\n"; 3442 | }; 3443 | if ($@) { 3444 | xlog("error", "Error reporting error : $@\n"); 3445 | } 3446 | exit $ret; 3447 | } 3448 | } 3449 | } 3450 | 3451 | # fake pid 3452 | if ($err) { 3453 | $ret = $STATUS_EXECERR if !$ret; 3454 | $pid = 'e' . $opts->{id}; 3455 | xlog("note", "Error '$err' with job $opts->{id}, pid '$pid'"); 3456 | my $out = { 3457 | id=>$opts->{id}, 3458 | status=>$ret, 3459 | error=>$err, 3460 | dumped=>1, 3461 | }; 3462 | print XOUT packfile($out); 3463 | close XOUT; 3464 | 3465 | xlog("debug", "FILE: $tfile"); 3466 | 3467 | notifystat($out, 1, 1); 3468 | # special case... exec failed 3469 | exit 1 if $pid == -1; 3470 | } else { 3471 | xlog("note", "Started job $opts->{id}, pid $pid, '@cmd'") if !$err; 3472 | close XOUT; 3473 | # record pid for wait 3474 | $pid_jobs{$pid}->{jid}=$opts->{id}; 3475 | $pid_jobs{$pid}->{time}=time(); 3476 | } 3477 | 3478 | $opts->{pid} = $pid; 3479 | $opts->{err} = $err; 3480 | print PF packfile($opts); 3481 | close PF; 3482 | 3483 | # ok we've officially either a) started this job or b) notified of failure at this point.... 3484 | sendcmd_nowait($conf{master}, $conf{port}, 'jexok', { map { $_ => $opts->{$_} } qw/id pid uid gid err/ }); 3485 | } 3486 | 3487 | sub send_status_for_job { 3488 | my ($stat, %op) = @_; 3489 | # this could be in a fork... be safe 3490 | my $sock = zmq_socket($context, ZMQ_DEALER); 3491 | xlog("note", "Sending status for $stat->{id} as $stat->{status}\n") if $conf{trace}; 3492 | zmq_connect($sock,"tcp://$conf{bind}:$conf{port}"); 3493 | zmq_send($sock, "", 0, ZMQ_SNDMORE); 3494 | zmq_send($sock, packcmd("sstat", $stat)); 3495 | zmq_close($sock); 3496 | zmq_term($context); 3497 | } 3498 | 3499 | sub jid_from_opts { 3500 | my ($job) =@_; 3501 | return $job->{jid} ? 
$job->{jid} : jid_from_guid($job->{guid}); 3502 | } 3503 | 3504 | sub jid_from_guid { 3505 | my ($guid) = @_; 3506 | if (-s "$conf{spool}/guids/$guid") { 3507 | return slurp("$conf{spool}/guids/$guid"); 3508 | } 3509 | } 3510 | 3511 | sub create_guid { 3512 | substr(Data::UUID->new()->create_hex(),2) 3513 | } 3514 | 3515 | sub exec_clean { 3516 | my ($jid) = @_; 3517 | if (!-e "$conf{spool}/jpids/$jid") { 3518 | xlog("debug", "Clean $jid which is already gone"); 3519 | } else { 3520 | xlog("debug", "Cleaning $jid") if $conf{trace}; 3521 | # stop tracking this pid, head node got all the info 3522 | my $job = unpack_file("$conf{spool}/jpids/$jid"); 3523 | # nobody asked for the job info, so don't keep it around after exec node knows about it 3524 | unlink("$conf{spool}/jpids/$jid"); 3525 | unlink("$conf{spool}/jstat/$jid.stat"); 3526 | unlink("$conf{spool}/jstat/$jid.held"); 3527 | unlink("$conf{spool}/jstat/$jid.stat-err"); 3528 | unlink("$conf{spool}/jstat/$jid.sh"); 3529 | unlink("$conf{spool}/jstat/$jid.job"); 3530 | unlink("$conf{spool}/jstat/$jid.dack"); 3531 | unlink("$conf{spool}/jstat/$jid.dumped"); 3532 | } 3533 | } 3534 | 3535 | sub stream_sigh { 3536 | xlog("debug", "stream $$ SIG $_[0]"); 3537 | 3538 | # set the quit flag 3539 | $stream_quit = 1; 3540 | close STDIN; 3541 | close STDOUT; 3542 | close STDERR; 3543 | # don't exit... you still need to send the "quit" signal 3544 | # exit(-1); 3545 | } 3546 | 3547 | sub do_stream { 3548 | # stream results back to execution top 3549 | 3550 | $SIG{INT} = \&stream_sigh; 3551 | $SIG{TERM} = \&stream_sigh; 3552 | $SIG{PIPE} = \&stream_sigh; 3553 | 3554 | # shift off the -X 3555 | shift @ARGV; 3556 | 3557 | my $termio; 3558 | 3559 | # line by line? 3560 | if ($ARGV[0] eq '-I') { 3561 | $termio=1; 3562 | shift @ARGV; 3563 | } 3564 | 3565 | my $connect = shift @ARGV; 3566 | my $key = shift @ARGV; 3567 | my $data = shift @ARGV; 3568 | my $sock = zmq_socket($context, ZMQ_DEALER); 3569 | 3570 | zmq_setsockopt($sock, ZMQ_LINGER, 1000); 3571 | zmq_setsockopt($sock, ZMQ_HWM, 100); 3572 | # zmq_setsockopt($sock, ZMQ_RCVTIMEO, 1000); 3573 | 3574 | xlog("debug", "stream $$ $connect $key"); 3575 | 3576 | zmq_connect($sock,"tcp://$connect"); 3577 | 3578 | xlog("debug", "stream sready $$ $connect $key"); 3579 | 3580 | zmq_send($sock, "", 0, ZMQ_SNDMORE); 3581 | zmq_send($sock, packcmd('sready', $key)); 3582 | 3583 | my $ready=0; 3584 | my $time=time(); 3585 | my $wait = 5; 3586 | while (!$stream_quit && !$ready) { 3587 | my $got = 0; 3588 | zmq_poll([{ 3589 | socket=>$sock, events=>ZMQ_POLLIN, callback=> sub { 3590 | my $ignore = zmq_recvmsg($sock); 3591 | my $msg = zmq_recvmsg($sock); 3592 | my $data = zmq_msg_data($msg); 3593 | $ready = $data=~/ready|quit/; 3594 | $stream_quit = $data=~/quit/; 3595 | $got = 1; 3596 | }}],1000); 3597 | if (!$stream_quit && !$got && (time() > ($time+$wait))) { 3598 | # ask again ... 
are you ready for the stream 3599 | zmq_send($sock, "", 0, ZMQ_SNDMORE); 3600 | zmq_send($sock, packcmd('sready', $key)); 3601 | $time=time(); 3602 | $wait += 5; 3603 | if ($wait > 3600) { 3604 | xlog("debug", "stream abandon $$ $key\n"); 3605 | exit(0); 3606 | } 3607 | } 3608 | } 3609 | 3610 | xlog("debug", "stream response $$ $key $data\n"); 3611 | 3612 | my $sent = 1; 3613 | while(!$stream_quit) { 3614 | if ($sent) { 3615 | if ($termio) { 3616 | # line by line 3617 | $_=<>; 3618 | } else { 3619 | # block by block 3620 | read STDIN, $_, 4096; 3621 | } 3622 | if ($_ eq "") { 3623 | $stream_quit = 1 ; 3624 | last; 3625 | } 3626 | $sent=0; 3627 | } 3628 | zmq_poll([{ 3629 | socket=>$sock, events=>ZMQ_POLLOUT, callback=> sub { 3630 | my $ret; 3631 | if ($ret=zmq_send($sock, "", 0, ZMQ_SNDMORE)) { 3632 | xlog("debug", "stream error $ret $$ $key : $?/$!\n"); 3633 | } else { 3634 | if (($ret=zmq_send($sock, packcmd('stream', $key, $_))) && $? ) { 3635 | xlog("debug", "stream error $ret $$ $key : $?/$!\n"); 3636 | } else { 3637 | # ok, that chunk of data went out 3638 | $sent=1; 3639 | } 3640 | } 3641 | }}]); 3642 | } 3643 | 3644 | # let daddy know we're done 3645 | 3646 | zmq_send($sock, "", 0, ZMQ_SNDMORE); 3647 | zmq_send($sock, packcmd("stream", "$key:end")); 3648 | zmq_close($sock); 3649 | zmq_term($context); 3650 | 3651 | xlog("debug", "stream exit $$ $key\n"); 3652 | 3653 | exit(0); 3654 | } 3655 | 3656 | sub debugging { 3657 | return $conf{log_types}->{"debug"} 3658 | } 3659 | 3660 | sub getjobstathash { 3661 | my ($jid)=@_; 3662 | if ( -s "$conf{spool}/jstat/$jid.stat" ) { 3663 | my $stat = unpack_file("$conf{spool}/jstat/$jid.stat"); 3664 | if ( $stat && -e "$conf{spool}/jstat/$jid.dumped" ) { 3665 | $stat->{dumped}=slurp("$conf{spool}/jstat/$jid.dumped"); 3666 | } 3667 | return $stat; 3668 | } 3669 | if ( -s "$conf{spool}/jstat/$jid.stat-err" ) { 3670 | my $stat = unpack_file("$conf{spool}/jstat/$jid.stat-err"); 3671 | if ( $stat && -e "$conf{spool}/jstat/$jid.dumped" ) { 3672 | $stat->{dumped}=slurp("$conf{spool}/jstat/$jid.dumped"); 3673 | } 3674 | return $stat; 3675 | } 3676 | return undef; 3677 | } 3678 | 3679 | sub forkandgo { 3680 | my ($zid, $sub, @args) = @_; 3681 | 3682 | print("FORKING\n"); 3683 | 3684 | if (!(my $pid=fork)) { 3685 | my $out=""; 3686 | my $err=""; 3687 | if (! 
defined $pid) { 3688 | $err = "Error: Can't fork"; 3689 | sendcmd($conf{bind}, $conf{port}, 'frep', {zid=>unpack("h*",$zid), out=>$err}); 3690 | } else { 3691 | eval { 3692 | zmq_fork_undef(); 3693 | $daemon=0; 3694 | $context=zmq_init(); 3695 | $0="GRUN:fork-handler"; 3696 | # no accidental messages go out on this context 3697 | $out=eval{&$sub(@args)}; 3698 | $err=$@; 3699 | $out = $err if $err && !$out; 3700 | while(length($out)>100000) { 3701 | sendcmd($conf{bind}, $conf{port}, 'frep', {zid=>unpack("h*",$zid), out=>substr($out,0,100000,""), more=>1}); 3702 | } 3703 | sendcmd($conf{bind}, $conf{port}, 'frep', {zid=>unpack("h*",$zid), out=>$out}); 3704 | }; 3705 | exit(0); 3706 | } 3707 | } 3708 | } 3709 | 3710 | sub archive_job { 3711 | my ($jid, $job, $status, $ip, $usage) = @_; 3712 | 3713 | # double-check 3714 | carp "Need a job ref" unless ref($job); 3715 | carp "Need a usage ref" unless !defined($usage) || ref($usage); 3716 | 3717 | $status+=0; 3718 | if ($usage) { 3719 | delete $usage->{ip}; 3720 | delete $usage->{id}; 3721 | $usage->{status} = $status if defined $status; 3722 | $job->{usage} = $usage; 3723 | } else { 3724 | $job->{usage}->{status} = $status if defined($status); 3725 | } 3726 | 3727 | $job->{status} = $status; 3728 | if ($ip) { 3729 | $job->{host} = $nodes{$ip} && $nodes{$ip}->{hostname} ? $nodes{$ip}->{hostname} : $ip; 3730 | $job->{hostip} = $ip; 3731 | } 3732 | 3733 | for (keys(%{$j_wait{$jid}})) { 3734 | replymsg($_, $job); 3735 | } 3736 | delete $j_wait{$jid}; 3737 | 3738 | my $jhistfile=jhistpath($jid); 3739 | xlog("debug", "Writing history for $jid to $jhistfile"); 3740 | open(ST, ">$jhistfile"); 3741 | print ST packfile($job); 3742 | close ST; 3743 | 3744 | xlog("debug", "Unlinking $conf{spool}/jobs/$jid:$ip.run, $conf{spool}/jobs/$jid.ip, $conf{spool}/jobs/$jid"); 3745 | 3746 | if (!$ip) { 3747 | $ip = slurp("$conf{spool}/jobs/$jid.ip"); 3748 | } 3749 | unlink("$conf{spool}/jobs/$jid:$ip.run"); 3750 | unlink("$conf{spool}/jobs/$jid.ip"); 3751 | unlink("$conf{spool}/jobs/$jid.ok"); 3752 | unlink("$conf{spool}/jobs/$jid"); 3753 | if ($job->{guid}) { 3754 | # can't query by guid anymore, except maybe in a history database 3755 | unlink("$conf{spool}/guids/$job->{guid}"); 3756 | } 3757 | 3758 | if ($start_wait{$jid}) { 3759 | # info needed for status/stdio collection from execution node 3760 | replymsg($start_wait{$jid}->{zid},jid=>$jid, status=>$status, error=>$job->{error}, hostname=>$job->{host}, ip=>$job->{host}?$job->{host}:"n/a"); 3761 | delete $start_wait{$jid}; 3762 | } 3763 | } 3764 | 3765 | my $coder; 3766 | sub pretty_encode { 3767 | $coder = JSON::XS->new->ascii->pretty->canonical->allow_blessed unless defined $coder; 3768 | if (@_ > 1 || !ref(@_[0])) { 3769 | $coder->encode([@_]); 3770 | } else { 3771 | $coder->encode(@_[0]); 3772 | } 3773 | } 3774 | 3775 | sub showmem { 3776 | require PadWalker; 3777 | my $o = PadWalker::peek_our(0); 3778 | my $h = PadWalker::peek_my(1); 3779 | for (keys(%$o)) { 3780 | $h->{$_}=$o->{$_}; 3781 | } 3782 | for (keys(%$h)) { 3783 | $h->{$_}=${$h->{$_}} if ref($h->{$_}) eq 'SCALAR'; 3784 | $h->{$_}=${$h->{$_}} if ref($h->{$_}) eq 'REF'; 3785 | } 3786 | return pretty_encode($h); 3787 | } 3788 | 3789 | sub do_execute { 3790 | # shift off the -Y 3791 | shift @ARGV; 3792 | my ($bfile) = @ARGV; 3793 | 3794 | my $opts = unpack_file("$bfile.job"); 3795 | my $uid = $opts->{uid}; 3796 | 3797 | my $code = -1; 3798 | my $start = Time::HiRes::time(); 3799 | 3800 | my $shfile = "$bfile.sh"; 3801 | 3802 | my @cmd = ("bash", 
$shfile); 3803 | 3804 | open(OLDERR, ">&STDERR"); 3805 | open(OLDOUT, ">&STDOUT"); 3806 | 3807 | close(STDERR); 3808 | close(STDOUT); 3809 | 3810 | my ($out_pid, $err_pid); 3811 | 3812 | $SIG{INT} = $SIG{TERM} = sub { 3813 | kill 2, $out_pid; 3814 | kill 2, $err_pid; 3815 | }; 3816 | 3817 | my ($err, $ret); 3818 | 3819 | my $ok=1; 3820 | eval { 3821 | if ($opts->{out}) { 3822 | if ($opts->{out_a}) { 3823 | $ok&&=open(STDOUT, ">>$opts->{out}"); 3824 | } else { 3825 | $ok&&=open(STDOUT, ">$opts->{out}"); 3826 | } 3827 | if ($opts->{out} eq $opts->{err}) { 3828 | $ok&&=open(STDERR, ">&STDOUT"); 3829 | } 3830 | } 3831 | if ($opts->{err} && !(($opts->{out} eq $opts->{err}))) { 3832 | if ($opts->{err_a}) { 3833 | $ok&&=open(STDERR, ">>$opts->{err}"); 3834 | } else { 3835 | $ok&&=open(STDERR, ">$opts->{err}"); 3836 | } 3837 | } 3838 | if ($opts->{io}) { 3839 | my $confarg = "-C $conf{config}"; # same config! 3840 | my $streamarg = "-X"; # grun streamer 3841 | $streamarg .= " -I" if $opts->{int}; # interactive mode 3842 | if (!$opts->{out}) { 3843 | my $cmd="/usr/bin/perl $GRUN_PATH $confarg $streamarg $conf{bind}:$conf{port} $opts->{id}:out"; 3844 | $out_pid=open(STDOUT, "|$cmd"); 3845 | $ok&&=$out_pid; 3846 | } 3847 | if (!$opts->{err}) { 3848 | my $cmd="/usr/bin/perl $GRUN_PATH $confarg $streamarg $conf{bind}:$conf{port} $opts->{id}:err"; 3849 | $err_pid=open(STDERR, "|$cmd"); 3850 | $ok&&=$err_pid; 3851 | } 3852 | } else { 3853 | # save disk and time, i never want i/o 3854 | if (!$opts->{err}) { 3855 | $ok&&=open(STDERR, ">/dev/null"); 3856 | } 3857 | if (!$opts->{out}) { 3858 | $ok&&=open(STDOUT, ">/dev/null"); 3859 | } 3860 | } 3861 | }; 3862 | 3863 | if ($@ || !$ok) { 3864 | close(STDERR); 3865 | close(STDOUT); 3866 | open(STDERR, ">&OLDERR"); 3867 | open(STDOUT, ">&OLDOUT"); 3868 | if ($@) { 3869 | $err=$@; 3870 | $ret=109; 3871 | } else { 3872 | $err="Error opening i/o files: $!"; 3873 | $ret=109; 3874 | } 3875 | xlog("error", "$ret: $err\n"); 3876 | } 3877 | 3878 | my $elapsed=0; 3879 | 3880 | # deal with nfs sync, if needed 3881 | syncdirs(@{$opts->{syncdirs}}) if $opts->{syncdirs}; 3882 | 3883 | if (!$err) { 3884 | eval { 3885 | $code = system(@cmd); 3886 | $0="GRUN:$opts->{id}:$code"; 3887 | }; 3888 | $elapsed = Time::HiRes::time() - $start; 3889 | } else { 3890 | $code = $ret ? $ret : -1; 3891 | } 3892 | 3893 | open(SAVERR, ">&STDERR"); 3894 | open(SAVOUT, ">&STDOUT"); 3895 | 3896 | open(STDERR, ">&OLDERR"); 3897 | open(STDOUT, ">&OLDOUT"); 3898 | 3899 | xlog("debug", "Job $opts->{id} original exit code is $code"); 3900 | 3901 | $code = ($code == -1) ? $code : ($code & 127) ? $code & 127 : ($code >> 8); 3902 | 3903 | xlog("debug", "Done running job $opts->{id}, code $code as user $uid") if $opts->{trace}; 3904 | 3905 | my ($utime, $stime, 3906 | $maxrss, $ixrss, $idrss, $isrss, $minflt, $majflt, $nswap, 3907 | $inblock, $oublock, $msgsnd, $msgrcv, 3908 | $nsignals, $nvcsw, $nivcsw); 3909 | 3910 | eval { 3911 | ($utime, $stime, 3912 | $maxrss, $ixrss, $idrss, $isrss, $minflt, $majflt, $nswap, 3913 | $inblock, $oublock, $msgsnd, $msgrcv, 3914 | $nsignals, $nvcsw, $nivcsw) = getrusage(RUSAGE_CHILDREN); 3915 | }; 3916 | 3917 | xlog("debug", "Really done with job $opts->{id} (@cmd = $@)\n") if $opts->{trace}; 3918 | 3919 | # $msgsnd, $msgrcv, $nsignals not used in Linux.... 
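# The raw status from system() was reduced above to either the terminating
# signal number ($code & 127) or the shell-style exit code ($code >> 8);
# that value, together with the getrusage() counters, forms the status
# record for this job.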
3920 | 3921 | my $out = { 3922 | id=>$opts->{id}, 3923 | status=>$code, 3924 | start_time=>$start, 3925 | utime=>$utime, 3926 | stime=>$stime, 3927 | rtime=>$elapsed, 3928 | pid=>$$, 3929 | maxrss=>$maxrss, 3930 | minflt=>$minflt, 3931 | majflt=>$majflt, 3932 | nswap=>$nswap, 3933 | inblock=>$inblock, 3934 | oublock=>$oublock, 3935 | nvcsw=>$nvcsw, 3936 | nivcsw=>$nivcsw 3937 | }; 3938 | 3939 | $out->{error} = $err if $code && $err; 3940 | 3941 | xlog("debug", "Writing output job $opts->{id}\n") if $opts->{trace}; 3942 | print XOUT packfile($out); 3943 | close XOUT; 3944 | xlog("debug", "Job $opts->{id} exit code $code\n") if $opts->{trace}; 3945 | send_status_for_job($out); 3946 | exit $code; 3947 | # special exit code 101 means couldn't run the command 3948 | close STDOUT; 3949 | close STDERR; 3950 | exit(0); 3951 | } 3952 | 3953 | --------------------------------------------------------------------------------