#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test exclusive resource allocation for a step (--exclusive option).
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2007 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set test_id      "1.14"
set exit_code    0
set file_in      "test$test_id.input"
set file_in2     "test$test_id.input2"
set file_out     "test$test_id.output"
set file_out2    "test$test_id.output2"
set job_id       0
set gpu_tot      0
set job_tres_cnt 0
set hostname     ""

print_header $test_id

if {[test_front_end]} {
	send_user "\nWARNING: This test is incompatible with front-end systems\n"
	exit $exit_code
}

# NOTE(review): get_node_config is not defined in this file. It presumably
# fills in the hostname and gpu_tot globals used by the GPU check at the end
# of the test -- confirm, since nothing visible here assigns them.
if {![get_node_config 2]} {
	log_warn "This test requires 2 or more GPUs in a node of the default partition. Skipping.\n"
	exit $exit_code
}

#
# Delete left-over input script
# Build input script file
# Run one more step than allocated CPUs and make sure it waits
# The "sleep 4" is meant to ensure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
make_bash_script $file_in "
echo tasks_per_node=\$SLURM_TASKS_PER_NODE
if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
	sleep_secs=45
else
	sleep_secs=10
fi
inx=0
while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
do
	$srun --exclusive -n1 $bin_sleep \$sleep_secs &
	inx=\$((inx+1))
done
$bin_sleep 4
$srun -v --exclusive -n1 ./$file_in2 &
wait
"
make_bash_script $file_in2 "
$scontrol show steps
"

#
# Spawn a job via sbatch
#
spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
if { $job_id == 0 } {
	send_user "\nFAILURE: failed to submit job\n"
	exit 1
}

#
# Wait for job to complete
#
if {[wait_for_job $job_id "DONE"] != 0} {
	send_user "\nFAILURE: waiting for job to complete\n"
	cancel_job $job_id
	set exit_code 1
}

#
# Check for desired output
# The extra step lists the steps active when it finally runs; there must
# not be more of them than there are tasks on the node.
#
if {[wait_for_file $file_out] != 0} {
	send_user "\nFAILURE: Output file $file_out is missing\n"
	exit 1
}
set matches 0
set tasks_per_node 0
spawn $bin_cat $file_out
expect {
	-re "tasks_per_node=($number)" {
		set tasks_per_node $expect_out(1,string)
		# Keep scanning: without this the StepId lines that follow are
		# never counted and the check below can never fail.
		exp_continue
	}
	-re "StepId=$job_id" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if { $matches > $tasks_per_node } {
	send_user "\nFAILURE: Problem with exclusive resource allocation "
	send_user "for step ($matches > $tasks_per_node)\n"
	set exit_code 1
}

if {$exit_code == 0} {
	send_user "\nSo far, so good. Trying with --immediate option\n\n"
} else {
	exit $exit_code
}

#
# Delete left-over input script
# Build another input script file
# Run one more step than allocated CPUs with immediate option and make
# sure it aborts
# The "sleep" is meant to ensure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_out
make_bash_script $file_in "
inx=0
if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
	sleep_secs=45
else
	sleep_secs=10
fi
while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
do
	$srun --exclusive -n1 --mem=0 $bin_sleep \$sleep_secs &
	inx=\$((inx+1))
done
$bin_sleep 4
$srun -v --exclusive -n1 --mem=0 --immediate ./$file_in2 &
wait
"

#
# Spawn a job via sbatch
# Reset job_id first so that a failed submission is detected instead of
# silently reusing the id of the previous job.
#
set job_id 0
spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out2 ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
if { $job_id == 0 } {
	send_user "\nFAILURE: failed to submit job\n"
	exit 1
}

#
# Wait for job to complete
#
if {[wait_for_job $job_id "DONE"] != 0} {
	send_user "\nFAILURE: waiting for job to complete\n"
	cancel_job $job_id
	set exit_code 1
}

#
# Check for desired output
# The --immediate step must be refused exactly once and must never get far
# enough to print step information.
#
if {[wait_for_file $file_out2] != 0} {
	send_user "\nFAILURE: Output file $file_out2 is missing\n"
	exit 1
}
set matches 0
spawn $bin_cat $file_out2
expect {
	-re "StepId=$job_id" {
		send_user "\nFAILURE: Problem --exclusive and --immediate option for step\n"
		set exit_code 1
		exp_continue
	}
	-re "Unable to create " {
		send_user "This error was expected, no worries\n"
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if { $matches != 1 } {
	send_user "\nFAILURE: Problem --exclusive and --immediate option for step ($matches != 1)\n"
	set exit_code 1
}

#
# Verify that all GPUs are allocated with the --exclusive flag
#
# NOTE(review): hostname and gpu_tot still hold their initial values unless
# a helper sourced from globals assigned them -- verify before relying on
# this comparison.
#
set job_id 0
spawn $srun -N1 -w $hostname --gres=gpu --exclusive $bin_printenv SLURM_JOBID
expect {
	-re "($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "srun not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
# Without a job id there is nothing to inspect or wait for.
if { $job_id == 0 } {
	log_error "failed to obtain a job id from srun\n"
	exit 1
}

set job_gpu_cnt [get_job_gpu_cnt $job_id]
if {$gpu_tot != $job_gpu_cnt} {
	log_error "Job didn't allocate all GPUs ($gpu_tot != $job_gpu_cnt)"
	set exit_code 1
} else {
	log_info "Node GPU count matches Job GPU count ($gpu_tot == $job_gpu_cnt)"
}

if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "job wasn't able to complete\n"
	cancel_job $job_id
	set exit_code 1
}

# Temporary files are only removed on success so failures can be inspected.
if {$exit_code == 0} {
	exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
	send_user "\nSUCCESS\n"
}
exit $exit_code