#!/usr/bin/perl -w
#
# reaver:  Kill any processes that are not owned by users with jobs assigned
#          to this node.
# Copyright 2006, 2007, 2011, 2012, 2015 Ohio Supercomputer Center
#
# License:  GNU GPL v2; see ../COPYING for details.
# Revision info:
# $HeadURL$
# $Revision$
# $Date$
#
# Originally intended to be run inside a PBS job epilogue:
# all -j$PBS_JOBID -p /usr/local/sbin/reaver
# where jobid = (jobid to clean up)
#
# Why is this called "reaver" and not "reaper"?  Because I like
# the TV show "Firefly", that's why. :)  --troy
#
use strict;
use warnings;
use POSIX();
use Getopt::Long;
use diagnostics;

# Turn off output buffering so that "sleep" calls don't cause a delay
# on standard output
$| = 1;

# uids at or below this are assumed to be system accounts
my $sysuid_threshold = 100;

# regexp of usernames with uids above $sysuid_threshold that are allowed
# to have processes running on a node without a PBS job
my $safelist = "arraysvcs|oscmon|decypher|rtkit|nagios";

# Exact user names discovered at run time (job owners, console owner).
# These are kept out of the $safelist regexp so that odd characters read
# from a job file can never be interpreted as regexp syntax.
my %safe_user;

# Should we attempt to kill strays? (0 = no, 1 = yes)
my $kill_mode = 0;

# Clean up after a particular job (i.e. don't whitelist its owner)
my $reap_job;

# Output is either brief, normal, or verbose.  In brief mode, we output only
# the PIDs for stray processes (one per line).  Normal mode displays "ps"
# listing for stray procs.  Verbose mode is the same as normal mode with
# extras (selecting verbose turns brief mode off).
my $brief_mode   = 0;
my $verbose_mode = 0;
my $long_ps_mode = 1;

# Usage/help functions
my $USAGE = "$0 [-bchklv] [-j <jobid>]";
my $HELP  = "Options:
  -b, --brief               Display only the stray process IDs
  -c, --classic             Display \"classic\" summary of process information
  -h, --help                Print detailed help screen
  -j <jobid>, --job=<jobid> Clean up processes associated with <jobid>
  -k, --kill                Attempt to kill all stray processes
  -l, --long                Display long summary of process information (default)
  -v, --verbose             Display extra info
";

# Find $PBS_HOME
my $spool = undef;
if ( defined( $ENV{"PBS_HOME"} ) ) {
    $spool = $ENV{"PBS_HOME"};
}
else {
    my @defaults = (
        "/usr/spool/PBS",
        "/var/spool/pbs",
        "/var/spool/torque",
        "/var/spool/batch/pbs",
        "/var/spool/batch/torque",
        "/var/spool/batch/pbs-piv",
        "/var/spool/batch/pbs-ipf",
        ".",
    );
    foreach my $dir (@defaults) {
        if ( -d $dir ) {
            $spool = $dir;
            last;
        }
    }
}
if ( !defined($spool) ) {
    die "Unable to find PBS spool directory!\n";
}

# Print the one-line usage summary.
sub print_usage {
    print "Usage: $USAGE\n";
}

# Print the usage summary followed by the option descriptions.
sub print_help {
    print "Usage: $USAGE\n";
    print "$HELP\n";
}

# Print an optional message and the usage summary, then exit with status 1.
sub usage_error {
    my $MSG = shift || "";
    print $MSG;
    print_usage();
    exit(1);
}

# Return the list of effective user names ("euser") recorded in one job file.
# Returns an empty list if the file cannot be read or names no user.
sub job_users {
    my ($jobfile) = @_;

    my $fh;
    if ( !open( $fh, '<', $jobfile ) ) {
        warn "Unable to read $jobfile: $!\n";
        return;
    }
    binmode($fh);
    my $first_line = <$fh>;
    if ( !defined($first_line) ) {
        close($fh);
        return;
    }

    my @users;

    # TORQUE 4(?) and after have XML .JB files; earlier versions
    # are binary.  A file counts as XML when its first line mentions "xml".
    if ( $first_line =~ /xml/i ) {
        my $rest = do { local $/; <$fh> };
        my $content = $first_line . ( defined($rest) ? $rest : "" );

        # Pull the text out of each <euser>...</euser> element.  Allowing
        # attributes on the opening tag is defensive; the element name is
        # the only thing this script knows about the file format.
        while ( $content =~ m{<euser[^>]*>\s*([^<\s]+)\s*</euser>}g ) {
            push( @users, $1 );
        }
        close($fh);
    }
    else {
        close($fh);

        # Binary job file: the user name is taken to be the printable string
        # that immediately follows a string containing "euser".  strings(1)
        # reads the file on stdin; the file name is shell-quoted so unusual
        # characters in it cannot be interpreted by the shell.
        ( my $quoted = $jobfile ) =~ s/'/'\\''/g;
        my @strings = `strings < '$quoted'`;
        chomp(@strings);
        foreach my $i ( 0 .. $#strings - 1 ) {
            next unless $strings[$i] =~ /euser/;
            my $candidate = $strings[ $i + 1 ];
            next if $candidate =~ /euser/;
            push( @users, $candidate );
        }
    }

    return grep { $_ ne "" } @users;
}

# Process command line options
my ( $opt_b, $opt_c, $opt_h, $opt_j, $opt_k, $opt_l, $opt_v );
Getopt::Long::Configure('bundling');
GetOptions(
    "b"       => \$opt_b,
    "brief"   => \$opt_b,
    "c"       => \$opt_c,
    "classic" => \$opt_c,
    "h"       => \$opt_h,
    "help"    => \$opt_h,
    "j=s"     => \$opt_j,
    "job=s"   => \$opt_j,
    "k"       => \$opt_k,
    "kill"    => \$opt_k,
    "l"       => \$opt_l,
    "long"    => \$opt_l,
    "v"       => \$opt_v,
    "verbose" => \$opt_v
) || usage_error();
if ($opt_h) { print_help(); exit(0); }
if ($opt_b) { $brief_mode   = 1; }
if ($opt_c) { $long_ps_mode = 0; }

# Test definedness, not truth, so that job id "0" is accepted.
if ( defined($opt_j) ) {

    # Keep only the part of the job id before the first "."
    $reap_job = $opt_j;
    $reap_job =~ s/\..*$//;
}
if ($opt_k) { $kill_mode    = 1; }
if ($opt_l) { $long_ps_mode = 1; }
if ($opt_v) { $verbose_mode = 1; $brief_mode = 0; }

# Record as safe the names of all users who have jobs currently running
# on this host
# The following approach doesn't put any load on the PBS server, but forces
# reaver to be run by root
foreach my $jobfile ( glob("$spool/mom_priv/jobs/*.JB") ) {

    # When cleaning up after a specific job, skip that job's own file
    # (named either <jobid>.JB or <jobid>.<anything>.JB) so that its owner
    # is not whitelisted on its account.
    next
      if defined($reap_job)
      && $jobfile =~ m{/\Q$reap_job\E(?:\..*)?\.JB$};

    foreach my $jobuser ( job_users($jobfile) ) {
        $safe_user{$jobuser} = 1;
    }
}

# Record as safe the owner of /dev/console
my @console_stat = stat("/dev/console");
if (@console_stat) {
    my $consoleowner = getpwuid( $console_stat[4] );
    if ( defined($consoleowner) && $consoleowner ne "root" ) {
        $safe_user{$consoleowner} = 1;
    }
}

if ($verbose_mode) {
    my @safe_names = grep { $_ ne "" } ( $safelist, sort keys %safe_user );
    print "Safe users: ", join( "|", @safe_names ), "\n" if @safe_names;
}

# find all the target pids
my @pids = ();
my @ps_command =
  $long_ps_mode
  ? ( "ps", "-w", "-A", "-o", "pid,%cpu,%mem,state,uid,user,lstart,command" )
  : ( "ps", "-A", "-o", "pid,state,uid,user,command" );

# List-form pipe open: ps is run directly, without a shell.
open( my $ps, "-|", @ps_command ) or die "Unable to run ps: $!\n";

# snarf the 1st line (the column headers)
my $header_line = <$ps>;

my $safe_re = qr/^($safelist)$/;

print "Stray processes:\n" if $verbose_mode;
while ( my $line = <$ps> ) {
    my ( $pid, $p_cpu, $p_mem, $state, $uid, $user, $lstart, $command );
    chomp($line);
    if ($long_ps_mode) {

        # NOTE(review): lstart is normally several whitespace-separated
        # words, so $lstart holds only the first one and the remainder ends
        # up at the front of $command.  The line printed below is still
        # complete because the two are printed next to each other.
        ( $pid, $p_cpu, $p_mem, $state, $uid, $user, $lstart, $command ) =
          split( ' ', $line, 8 );
    }
    else {
        ( $pid, $state, $uid, $user, $command ) = split( ' ', $line, 5 );
    }

    # Ignore anything that does not look like a process line.
    next unless defined($uid) && $uid =~ /^\d+$/ && defined($user);

    # A process is a stray unless it belongs to a system account, a safe
    # user, or the user running this script.
    next if $uid <= $sysuid_threshold;
    next if $uid == $>;
    next if $user =~ $safe_re || exists( $safe_user{$user} );

    push( @pids, $pid );
    if ($brief_mode) {
        print "$pid\n";
    }
    elsif ($long_ps_mode) {
        print $pid, " ", $p_cpu, " ", $p_mem, " ", $state, " ", $user, " ",
          $lstart, " ", $command, "\n";
    }
    else {
        printf "%8d %-3s %-9.9s %-10.50s\n", $pid, $state, $user, $command;
    }
}
if ($verbose_mode) {
    print " (none)\n" unless @pids;
}
close($ps);
die "ps failure" if ( ( $? >> 8 ) != 0 );

# pull the trigger
if ( @pids && $kill_mode ) {

    # terminate-with-extreme-prejudice version
    # the CONT signal is needed to reawaken processes that are
    # in a breakpoint in a debugger
    print "Sending SIGCONT...\n" if $verbose_mode;
    kill( POSIX::SIGCONT(), @pids );
    sleep(1);
    print "Sending SIGTERM...\n" if $verbose_mode;
    kill( POSIX::SIGTERM(), @pids );
    sleep(5);
    print "Sending SIGKILL...\n" if $verbose_mode;
    kill( POSIX::SIGKILL(), @pids );
}