#! /usr/bin/perl
# @configure_input@
################################################################################
#
# Usage: restart_script <session_id> <job_id> <user_id> <group_id>
#                       <checkpoint_dir> <checkpoint_name>
#
# This script is invoked by pbs_mom to restart a job.
#
# It drops privileges to the given user and group, changes into the
# checkpoint directory, runs cr_restart on the named checkpoint and exits
# with the status of that command.  When cr_restart fails, the failure text
# is attached to the job as a comment via qalter.
#
################################################################################
use strict;
use warnings;
use Sys::Syslog;

#
# These variables should be initialized by configure.
#
my $blcr_bindir="@BLCR_BINDIR@";
my $torque_bindir="@bindir@";
my $mom_priv="@PBS_SERVER_HOME@/mom_priv";

# Customize the following variables

# Customize the PATH to include location of blcr checkpoint commands
# (e.g. cr_restart), torque client commands (e.g. qalter),
# and pbs_mom checkpoint/restart scripts (e.g. restart_script,run_on_success)
$ENV{PATH} .= ":$blcr_bindir" if $blcr_bindir;
$ENV{PATH} .= ":$torque_bindir" if $torque_bindir;
$ENV{PATH} .= ":$mom_priv" if $mom_priv;

my $logLevel = 3; # 0 = none, 1 = fail, 2 = info, 3 = debug

logPrint(2, "Invoked: $0 " . join(' ', @ARGV) . "\n");

my ($sessionId, $jobId, $userId, $groupId, $checkpointDir, $checkpointName);
my $usage = "Usage: $0 <session_id> <job_id> <user_id> <group_id> <checkpoint_dir> <checkpoint_name>\n";
if (@ARGV == 6)
{
    ($sessionId, $jobId, $userId, $groupId, $checkpointDir, $checkpointName) =
        @ARGV;
}
else
{
    logDie(1, $usage);
}

# The job id ends up on several shell command lines; quote it once here.
my $quotedJobId = shellQuote($jobId);

# Set of cr_restart hooks for flagging the job with various error conditions.
# Set any of these to an empty string to omit the corresponding hook.
my $runOnSuccessCmd = "qalter -W checkpoint_restart_status=\"Successfully restarted job from checkpoint\" $quotedJobId";
my $runOnFailTempCmd = "qalter -W checkpoint_restart_status=\"Temporary failure restarting job from checkpoint\" $quotedJobId";
my $runOnFailPermCmd = "qalter -W checkpoint_restart_status=\"Permanent failure restarting job from checkpoint\" $quotedJobId";
my $runOnFailArgsCmd = "qalter -W checkpoint_restart_status=\"Argument failure restarting job from checkpoint\" $quotedJobId";
my $runOnFailEnvCmd = "qalter -W checkpoint_restart_status=\"Environment failure restarting job from checkpoint\" $quotedJobId";
my $runOnFailureCmd = "qalter -W checkpoint_restart_status=\"General failure restarting job from checkpoint\" $quotedJobId";

# Drop privileges to the job owner.  The group must be changed first, while
# the process is still allowed to do so.
my $gid = getgrnam($groupId);
logDie(1, "Unable to resolve group id ($groupId)\n") unless defined $gid;
$( = $gid;
# Listing the gid twice makes perl also reset the supplementary group list,
# so the groups of the invoking user are not inherited by the restarted job.
$) = "$gid $gid";
# $( and $) hold space-separated lists; the first entry is the gid itself.
my ($realGid) = split ' ', $(;
my ($effectiveGid) = split ' ', $);
logDie(1, "Unable to set gid: $gid")
    unless $realGid == $gid && $effectiveGid == $gid;

my $uid = getpwnam($userId);
logDie(1, "Unable to resolve user id ($userId)\n") unless defined $uid;
$< = $uid;
$> = $uid;
logDie(1, "Unable to set uid: $uid") unless $uid == $< && $uid == $>;

# Change to the checkpoint directory so that the checkpoint name given to
# cr_restart is resolved relative to it.  This must happen regardless of the
# configured log level.
chdir $checkpointDir
    or logDie(1, "Unable to cd to checkpoint dir ($checkpointDir): $!\n");

# Assemble the cr_restart command line.  Every hook command is passed as a
# single, shell-quoted argument.
my $cmd = "cr_restart";
$cmd .= " --run-on-success=" . shellQuote($runOnSuccessCmd)
    if $runOnSuccessCmd;
$cmd .= " --run-on-fail-perm=" . shellQuote($runOnFailPermCmd)
    if $runOnFailPermCmd;
$cmd .= " --run-on-fail-temp=" . shellQuote($runOnFailTempCmd)
    if $runOnFailTempCmd;
$cmd .= " --run-on-fail-args=" . shellQuote($runOnFailArgsCmd)
    if $runOnFailArgsCmd;
$cmd .= " --run-on-fail-env=" . shellQuote($runOnFailEnvCmd)
    if $runOnFailEnvCmd;
$cmd .= " --run-on-failure=" . shellQuote($runOnFailureCmd)
    if $runOnFailureCmd;
$cmd .= " " . shellQuote($checkpointName);

my ($restartRc, $restartOutput) = runCommand($cmd);

if ($restartRc)
{
    # Check whether this was a cr_restart error
    # We need to do this before updating the error with qalter
    my ($qstatRc, $qstatOutput) = runCommand("qstat -as $quotedJobId");

    if ($qstatOutput =~ m/^(.*failure restarting job.*)$/m)
    {
        my $comment = "$1: $restartOutput";

        # Attach restart error message to job
        # Before attaching message we first replace newlines with spaces
        $comment =~ s/\n/ /g;

        # The comment is wrapped in double quotes below, so embedded double
        # quotes are turned into single quotes to keep that wrapping intact.
        $comment =~ tr/"/'/;

        # Perl automatically enables a set of special security checks,
        # called taint mode, when it detects its program running with
        # differing real and effective user or group IDs.
        # We must therefore untaint the output.
        if ($comment =~ m/(.*)/)
        {
            $comment = $1;
        }

        # Build qalter command; qalter receives the argument
        # comment="<text>" with the double quotes included.
        my $qalterCmd =
            "qalter -W "
          . shellQuote("comment=\"$comment\"")
          . " $quotedJobId";
        runCommand($qalterCmd);
    }
}

# Exit with exit code of cr_restart command
exit $restartRc;

################################################################################
# runCommand($command)
# Run a shell command line with stderr merged into stdout and log the outcome
# (failures at level 1, successes at level 3).
# Returns the list (exit code, captured output).
################################################################################
sub runCommand
{
    my ($command) = @_;

    my $output = `$command 2>&1`;
    my $rc = exitStatus($?);
    $output = '' unless defined $output;

    if ($rc)
    {
        logPrint(1, "Subcommand ($command) failed with rc=$rc:\n$output");
    }
    else
    {
        logPrint(3, "Subcommand ($command) yielded rc=$rc:\n$output");
    }

    return ($rc, $output);
}

################################################################################
# exitStatus($status)
# Convert a raw wait status (as found in $?) into an exit code:
#   255          if the command could not be started ($? == -1)
#   128 + signal if the command was terminated by a signal
#   exit code    otherwise
################################################################################
sub exitStatus
{
    my ($status) = @_;

    return 255 if $status == -1;

    my $signal = $status & 127;
    return 128 + $signal if $signal;

    return $status >> 8;
}

################################################################################
# shellQuote($text)
# Return $text wrapped in single quotes, with embedded single quotes escaped,
# so that a POSIX shell passes it on as one literal word.
################################################################################
sub shellQuote
{
    my ($text) = @_;

    $text =~ s/'/'\\''/g;

    return "'$text'";
}

################################################################################
# logPrint($level, $message)
# Write a message (to syslog) if $level does not exceed the configured
# $logLevel
################################################################################
sub logPrint
{
    my ($level, $message) = @_;
    my @severity = ('none', 'warning', 'info', 'debug');

    return if $logLevel < $level;

    openlog('restart_script', '', 'user');
    # syslog() treats its second argument as a printf-style format, so the
    # message is passed as data to keep any '%' it contains literal.
    syslog($severity[$level], '%s', $message);
    closelog();
}

################################################################################
# logDie($level, $message)
# Write a message (to syslog) and die
################################################################################
sub logDie
{
    my ($level, $message) = @_;

    logPrint($level, $message);
    die($message);
}