#!/bin/sh
#
# sshjob - run "ps" on every node that belongs to a single Torque job,
# excluding processes owned by system users.
#
# Usage: sshjob job-id
#
# Exit status:
#   0  the node loop ran to completion (unreachable nodes are only warned about)
#   1  usage error, qstat failure, job not in state R, or empty node list
#   2  interrupted by a signal (HUP, INT, QUIT, ALRM, TERM)
#
# Fatal error messages go to stderr; the per-node report (including the
# "cannot ping" warning) goes to stdout so that it stays in order with the
# node headers.

# External commands and their flags.  PSFLAGS, SSH and PING hold a command
# plus options and are deliberately expanded unquoted below.
PS="/bin/ps"
PSFLAGS="-o pid,state,user,start,cputime,%cpu,rssize,command --columns 100"
SSH="ssh -n -x"
PING="/bin/ping -c 1 -w 3"
QSTAT=/usr/local/bin/qstat

# System users excluded from the list
USERLIST="root rpc rpcuser daemon ntp smmsp sshd hpsmh named dbus"

# Check command arguments
if test $# -ne 1; then
  echo "Usage: $0 job-id" >&2
  exit 1
fi
JOB=$1

# Temporary file holding the "qstat -f" output.  mktemp gives an
# unpredictable name instead of /tmp/jobstatus.$$.
JOBSTATUS=$(mktemp "${TMPDIR:-/tmp}/jobstatus.XXXXXX") || {
  echo "Error: cannot create temporary file" >&2
  exit 1
}

# Remove the temporary file on every exit path.  The signal trap only sets
# the exit status; the EXIT trap then does the cleanup.  (SIGSTOP cannot be
# trapped, so it is not listed.)
trap 'rm -f "$JOBSTATUS"' EXIT
trap 'exit 2' HUP INT QUIT ALRM TERM

# Check if this job-ID can be inquired successfully.
# (The qstat flag "-1" is only available from Torque 2.1)
if ! "$QSTAT" -f -1 "$JOB" > "$JOBSTATUS"; then
  echo "Error inquiring about job $JOB" >&2
  exit 1
fi

# Check the job state.  Match the attribute name exactly so that the string
# "job_state" occurring inside another attribute's value is not picked up.
JOBSTATE=$(awk '$1 == "job_state" { print $3; exit }' "$JOBSTATUS")
if test "$JOBSTATE" != "R"; then
  echo "The job $JOB is not running, it has state=$JOBSTATE" >&2
  exit 1
fi

# Get the Torque resource exec_host, a "+"-separated list of host/slot
# entries.  For each entry drop everything from the "/" on (the slot may have
# several digits or be a range), and print each host name once, in order of
# first appearance, because SMP nodes may be repeated.
NODELIST=$(awk '
  $1 == "exec_host" {
    n = split($3, entry, "+")
    for (i = 1; i <= n; i++) {
      host = entry[i]
      sub(/\/.*/, "", host)
      if (host != "" && !(host in seen)) {
        seen[host] = 1
        print host
      }
    }
    exit
  }' "$JOBSTATUS")
if test -z "$NODELIST"; then
  echo "Error: The node list is empty" >&2
  exit 1
fi

# Get the number of nodes and node properties used
NODES=$(awk '$1 == "Resource_List.nodes" { print $3; exit }' "$JOBSTATUS")
echo "This job uses $NODES nodes"
# NODELIST is left unquoted on purpose: word splitting joins the
# newline-separated names onto a single output line.
# shellcheck disable=SC2086
echo "Nodelist for job-id $JOB:" $NODELIST

# Loop over nodes and execute the "ps" command
for node in $NODELIST; do
  echo "----- Node $node -----"
  # Redirect stdout first, then stderr, so that all ping output is discarded.
  # shellcheck disable=SC2086
  if $PING "$node" > /dev/null 2>&1; then
    # ssh joins its arguments into one remote command line; the embedded
    # double quotes make the remote shell pass the user list to "ps -u" as a
    # single argument.
    # shellcheck disable=SC2086,SC2029
    $SSH "$node" "$PS $PSFLAGS --deselect -u \"$USERLIST\""
  else
    echo "*** WARNING *** Cannot ping host ${node} !"
  fi
done