#!/usr/bin/perl -w
#
# reaver:  Kill any processes that are not owned by users with jobs assigned
#          to this node.
# Copyright 2006 Ohio Supercomputer Center
# Revision info:
# $HeadURL$
# $Revision$
# $Date$
#
# Originally intended to be run inside a PBS job epilogue:
#   all -j$PBS_JOBID -p /usr/local/sbin/reaver [jobid [jobids...]]
# where jobid = (jobid to clean up)
#
# Why is this called "reaver" and not "reaper"?  Because I like
# the TV show "Firefly", that's why. :)  --troy
#
use strict;
use warnings;
use POSIX();
use Getopt::Long;

# Turn off output buffering so that "sleep" calls don't cause a delay
# on standard output
$|=1;

# uids at or below this are assumed to be system accounts; only processes
# whose uid is strictly greater than this value are ever treated as strays
my $sysuid_threshold=100;

# regexp (alternation) of usernames with uids above $sysuid_threshold that
# are allowed to have processes running on a node without a PBS job
my $safelist="arraysvcs|oscmon|decypher";

# Should we attempt to kill strays? (0 = no, 1 = yes)
my $kill_mode = 0;

# Output is either brief, normal, or verbose.  In brief mode, we output only
# the PIDs for stray processes (one per line).  Normal mode displays a
# "ps"-style listing for stray procs.  Verbose mode is the same as normal
# mode with extra progress information (-v switches brief mode off).
my $brief_mode = 0;
my $verbose_mode = 0;

# Usage/help functions
my $USAGE = "$0 [-hkbv] [jobid [jobids...]]";
my $HELP = "Options:
 -h, --help
    Print detailed help screen
 -k, --kill
    Attempt to kill all stray processes
 -b, --brief
    Display only the stray process IDs
 -v, --verbose
    Display extra info
 jobid
    ID of batch job to remove from safe list
";

# Print the one-line usage summary on standard output.
sub print_usage {
    print "Usage: $USAGE\n";
    return;
}

# Print the usage summary followed by the detailed option help.
sub print_help {
    print "Usage: $USAGE\n";
    print "$HELP\n";
    return;
}

# Print an optional message plus the usage summary, then exit with status 1.
sub usage_error {
    my $MSG = shift || "";
    print $MSG;
    print_usage();
    exit(1);
}

# Process command line options.  Short options may be bundled (e.g. -kv).
my ($opt_h, $opt_k, $opt_b, $opt_v);
Getopt::Long::Configure('bundling');
GetOptions(
    "h|help"    => \$opt_h,
    "k|kill"    => \$opt_k,
    "b|brief"   => \$opt_b,
    "v|verbose" => \$opt_v,
) || usage_error();

if ($opt_h) {
    print_help();
    exit(0);
}
if ($opt_k) {
    $kill_mode = 1;
}
if ($opt_b) {
    $brief_mode = 1;
}
# Verbose wins over brief when both are given.
if ($opt_v) {
    $verbose_mode = 1;
    $brief_mode = 0;
}

# append to $safelist the names of all users who have jobs currently running
# on this host

# first, we have to get a list of all the PBS jobids assigned to this host
my $hostname = `hostname`;
$hostname = "" unless defined($hostname);
chomp($hostname);
# An empty name would compare equal to the blank separator lines in the
# pbsnodes output, so refuse to continue without one.
die "Cannot determine hostname\n" if ( $hostname eq "" );

# undef = host not seen in pbsnodes output; "" = host seen but has no jobs
my $joblist = undef;

# Guess absolute path to command if it isn't already in $PATH.  The list
# form of open runs the command directly, without a shell.
my $pbsnodes_fh;
{
    no warnings "exec";
    open($pbsnodes_fh, "-|", "pbsnodes", "-a")
        or open($pbsnodes_fh, "-|", "/usr/local/pbs/bin/pbsnodes", "-a")
        or die "Cannot run pbsnodes command: $!";
}

# pbsnodes -a output looks like this
#   host
#      indented list of key = value pairs for host
#   blank line
#
# so we want to find $hostname in the list and then get the value
# of "jobs = " under that
HOST: while ( my $line = <$pbsnodes_fh> ) {
    chomp($line);
    next HOST unless ( $line eq $hostname );

    $joblist = "";
    ATTR: while ( my $attr = <$pbsnodes_fh> ) {
        chomp($attr);
        my ($key, $value) = split(/ \= /, $attr);
        # a blank line ends this host's attribute list
        last ATTR unless $key;
        $key =~ s/^ *//;
        if ( $key eq "jobs" ) {
            $joblist = defined($value) ? $value : "";
            # strip the leading "N/" from each entry and the commas
            # between entries, leaving space-separated jobids
            $joblist =~ s/[0-9]+\///g;
            $joblist =~ s/\,//g;
            last ATTR;
        }
    }
    last HOST;
}
close($pbsnodes_fh);
die "pbsnodes failure" if ( ($?>>8)!=0 );
die "Host $hostname not found in pbsnodes output.\n" unless defined($joblist);
# Users owning jobs on this node, collected from qstat.  They are kept apart
# from the $safelist regexp and compared by exact name (or uid), so a job
# owner whose name is merely a substring of a safelisted name (e.g. "mon"
# vs "oscmon") is still recognised as safe.
my @job_users   = ();
my %is_job_user = ();
my %is_job_uid  = ();

# Anchored form of the static safelist; undef when the safelist is empty.
my $safe_re = ( $safelist ne "" ) ? qr/^(?:$safelist)$/ : undef;

if ( $joblist ne "" ) {
    # there may be duplicates in the job list; keep the first occurrence
    # of each jobid, in the order reported
    my %seen = ();
    my @jobs = grep { length($_) && !$seen{$_}++ } split(/ /, $joblist);
    print "Running jobs: @jobs\n" if $verbose_mode;

    # remove from @jobs any jobids given on the command line.  A given id
    # must match a whole leading component of the jobid ("123" matches
    # "123.server" but not "1234.server"), and is matched literally.
    my @given = grep { defined($_) && length($_) } @ARGV;
    if ( @given ) {
        my $ptn = join("|", map { quotemeta($_) } @given);
        my $drop_re = qr/^(?:$ptn)(?![A-Za-z0-9])/;
        @jobs = grep { $_ !~ $drop_re } @jobs;
    }

    # now that we have a list of unique jobids, we need to get their
    # usernames and treat them as safe
    print "Safe jobs: " if $verbose_mode;
    if ( @jobs ) {
        # list-form open: jobids are passed as arguments, not via a shell
        my $qstat_fh;
        {
            no warnings "exec";
            open($qstat_fh, "-|", "qstat", @jobs)
                or open($qstat_fh, "-|", "/usr/local/pbs/bin/qstat", @jobs)
                or die "Cannot run qstat command: $!";
        }
        # skip 1st 2 lines of qstat output -- it's just headers
        for ( 1 .. 2 ) {
            my $header = <$qstat_fh>;
        }
        while ( my $line = <$qstat_fh> ) {
            chomp($line);
            my ($jobid, $jobname, $user) = split(/ +/, $line);
            next unless ( defined($user) && length($user) );
            print "$jobid($user) " if $verbose_mode;

            next if ( $safe_re && $user =~ $safe_re );
            next if $is_job_user{$user};
            $is_job_user{$user} = 1;
            push(@job_users, $user);
            # Also remember the uid, in case ps reports the user column
            # differently from qstat (some ps implementations abbreviate
            # long user names -- TODO confirm for the target platform).
            my $owner_uid = getpwnam($user);
            $is_job_uid{$owner_uid} = 1 if defined($owner_uid);
        }
        close($qstat_fh);
        die "qstat failure" if ( ($?>>8)!=0 );
    }
    print "\n" if $verbose_mode;
}
else {
    print "Running jobs: (none)\n" if $verbose_mode;
}

my $safe_display = join("|", grep { length($_) } $safelist, @job_users);
if ( $safe_display ne "" ) {
    print "Safe users: $safe_display\n" if $verbose_mode;
}

# find all the target pids
my @pids = ();
open(my $ps_fh, "-|", "ps", "-A", "-o", "pid,state,uid,user,command")
    or die "Cannot run ps command: $!";
# snarf the 1st line (column headers)
my $ps_header = <$ps_fh>;
print "Stray processes:\n" if $verbose_mode;
while ( my $line = <$ps_fh> ) {
    chomp($line);
    my ($pid, $state, $uid, $user, $command) = split(' ', $line, 5);
    # ignore anything that does not look like a ps data row
    next unless ( defined($user) && $uid =~ /^[0-9]+$/ );
    $command = "" unless defined($command);

    # A process is a stray only if it is not a system account, not owned
    # by the user running this script, not safelisted, and not owned by a
    # user with a job on this node.
    next if ( $uid <= $sysuid_threshold );
    next if ( $uid == $> );
    next if ( $safe_re && $user =~ $safe_re );
    next if ( $is_job_user{$user} || $is_job_uid{$uid} );

    push(@pids, $pid);
    if ($brief_mode) {
        print "$pid\n";
    }
    else {
        printf "%8d %-3s %-9.9s %-10.50s\n", $pid, $state, $user, $command;
    }
}
if ($verbose_mode) {
    print " (none)\n" unless @pids;
}
close($ps_fh);
die "ps failure" if ( ($?>>8)!=0 );

# pull the trigger
if ( @pids && $kill_mode ) {
    # terminate-with-extreme-prejudice version
    # the CONT signal is needed to reawaken processes that are
    # in a breakpoint in a debugger
    print "Sending SIGCONT...\n" if $verbose_mode;
    kill(POSIX::SIGCONT(), @pids);
    sleep(1);
    print "Sending SIGTERM...\n" if $verbose_mode;
    kill(POSIX::SIGTERM(), @pids);
    sleep(5);
    print "Sending SIGKILL...\n" if $verbose_mode;
    kill(POSIX::SIGKILL(), @pids);
}