#!/bin/bash
#
# dezombify:  Requeue a "running" PBS job that's not actually running
#             because it had JOIN JOB failures.
# Copyright 2006 Ohio Supercomputer Center
#
# License:  GNU GPL v2; see ../COPYING for details.
# Revision info:
# $HeadURL$
# $Revision$
# $Date$
#
# Usage:  dezombify jobid [...jobids...]
#
# Must be run as root
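# Requires the "all" command with its -j<jobid> option (used below to run
# commands on the nodes associated with a job)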
# Requires $PBS_HOME to be set
#
# Signs of a zombie job:
# * "qstat -f $jobid" reports job state as R but no CPU time or memory consumed
# * "qps -a $jobid" shows *NO* processes
# * "qtracejob $jobid | grep -c JOIN" gives a number that's less than (N-1),
#    where N==# nodes assigned to the job

# First, make sure we know where to look for the appropriate files to clobber.
# PBS_HOME is later interpolated into an "rm -rf" path, so refuse to continue
# unless it is set and names an existing directory.
#
# Exit status: 255 if PBS_HOME is unset/empty, 254 if it is not a directory.
# (These are the values bash produced for the former "exit -1" / "exit -2";
# they are now spelled inside the valid 0-255 range.)
if [ -z "$PBS_HOME" ]
then
    echo "PBS_HOME not set, exiting!" >&2
    exit 255
elif [ ! -d "$PBS_HOME" ]
then
    # Quoted on purpose: an unquoted value containing whitespace makes the
    # test itself fail with a syntax error, which would skip this branch and
    # let the script carry on with an unvalidated path.
    echo "$PBS_HOME not a directory, exiting!" >&2
    exit 254
fi

# Print the numeric sequence number of a job identifier on stdout.
# Arguments: $1 - job identifier, e.g. "1234" or "1234.server.example.com"
# Outputs:   the text before the first "." if it consists only of digits;
#            nothing at all otherwise (the caller treats that as bogus).
dz_jobid_number() {
    local seq=${1%%.*}
    case "$seq" in
        '' | *[!0-9]*) ;;              # empty or contains a non-digit
        *) printf '%s\n' "$seq" ;;
    esac
}

# Extract one job's state from "qstat -r" output.
# Arguments: $1 - numeric job id
# Input:     "qstat -r" output on stdin
# Outputs:   field 10 (the state column tested against "R" below) of the
#            first row whose first field is exactly $1 or begins with "$1.".
#            Matching the whole id, instead of grepping for it as a substring,
#            keeps job 23 from being mistaken for job 123 and keeps header or
#            job-name text that happens to contain the digits out of the
#            result.
dz_qstat_state() {
    awk -v id="$1" 'index($1 ".", id ".") == 1 && !seen++ { print $10 }'
}

# Requeue a single zombie job.
# Arguments: $1 - job identifier as given on the command line
# Globals:   PBS_HOME (read)
# Outputs:   diagnostics on stderr when the id is bogus or the job is not in
#            the R state.
dz_requeue_job() {
    local job=$1 jobid state

    # Since this later runs an rm -rf command as root,
    # sanity-check that the jobid is really a running job
    jobid=$(dz_jobid_number "$job")
    if [ -z "$jobid" ]
    then
        echo "Bogus jobid $job!" >&2
    else
        # Make sure job is "running"
        state=$(qstat -r "$jobid" | dz_qstat_state "$jobid")
        if [ "$state" = "R" ]
        then
            # hold the job
            qhold "$jobid"

            # stop all associated pbs_mom's
            all -p "-j$jobid" /sbin/service pbs_mom stop

            # nuke the job files on the nodes
            # (the '*' is passed through literally rather than expanded here;
            # ${PBS_HOME:?} aborts instead of building a path rooted at "/"
            # should PBS_HOME ever be empty)
            all -p "-j$jobid" /bin/rm -rf "${PBS_HOME:?}/mom_priv/jobs/$jobid."'*'

            # restart the pbs_mom's with -p option
            # can't be done in parallel because the pbs_server can't
            # keep up with all the "I'm back up!" messages
            all "-j$jobid" /sbin/service pbs_mom pstart \; sleep 1

            # "rerun" the job
            # since it's held, this causes the pbs_server to clear its
            # neednodes list and R state
            qrerun "$jobid"

            # release the job
            # if everything went OK, this will return the job to the Q state
            qrls "$jobid"
        else
            echo "Job $jobid not in R state!" >&2
        fi
    fi
}

# "$@" keeps every command-line argument as one word (no re-splitting or
# glob expansion of the job ids).
for job in "$@"
do
    dz_requeue_job "$job"
done