#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test exclusive resource allocation for a step (--exclusive option).
############################################################################
# Copyright (C) 2007 The Regents of the University of California.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
# Helper procs and variables used below ($sbatch, $srun, $number, skip,
# fail, log_error, ...) are not defined in this file; presumably they come
# from ./globals.
source ./globals

# exit_code collects non-fatal errors; it is checked after each phase.
set exit_code   0
set file_in     "test$test_id.input"
set file_in2    "test$test_id.input2"
set file_out    "test$test_id.output"
set file_out2   "test$test_id.output2"
set job_id      0

#
# Preconditions: no front-end system, a node that can satisfy --gres=gpu:2,
# and gres/gpu tracked in AccountingStorageTRES.
#
if {[get_config_param "FrontendName"] ne "MISSING"} {
	skip "This test is incompatible with front-end systems"
}

set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"]
if { [llength $node_name] != 1 } {
	skip "This test need to be able to submit jobs with at least --gres=gpu:2"
}
if {![param_contains [get_config_param "AccountingStorageTRES"] "gres/gpu"]} {
	skip "This test requires AccountingStorageTRES=gres/gpu"
}

#
# Delete left-over input script
# Build input script file
# Run one more step than allocated CPUs and make sure it waits
# The "sleep 4" is meant to ensure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
make_bash_script $file_in "
echo tasks_per_node=\$SLURM_TASKS_PER_NODE
if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
	sleep_secs=45
else
	sleep_secs=10
fi
inx=0
while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
do
	$srun --exclusive -n1 $bin_sleep \$sleep_secs &
	inx=\$((inx+1))
done
$bin_sleep 4
$srun -v --exclusive -n1 ./$file_in2 &
wait
"

# The extra step only lists the job steps known at the time it runs.
make_bash_script $file_in2 "
$scontrol show steps
"

#
# Spawn a job via sbatch
#
spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch not responding"
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
if { $job_id == 0 } {
	fail "Failed to submit job"
}

#
# Wait for job to complete
#
if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "Waiting for job to complete"
	cancel_job $job_id
	set exit_code 1
}

#
# Check for desired output
#
if {[wait_for_file $file_out] != 0} {
	fail "Output file $file_out is missing"
}
set matches 0
set tasks_per_node 0
spawn $bin_cat $file_out
# NOTE(review): the tasks_per_node branch has no exp_continue, so this
# expect returns as soon as that line is matched. If that line precedes the
# StepId lines in the output file, $matches stays 0 and the comparison below
# cannot fail. Confirm the intended behavior before changing it; the
# "StepId=" pattern may also need tightening if other output contains it.
expect {
	-re "tasks_per_node=($number)" {
		set tasks_per_node $expect_out(1,string)
	}
	-re "StepId=$job_id" {
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if { $matches > $tasks_per_node } {
	log_error "Problem with exclusive resource allocation for step ($matches > $tasks_per_node)"
	set exit_code 1
}

if {$exit_code == 0} {
	log_info "So far, so good. Trying with --immediate option"
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}

#
# Delete left-over input script
# Build another input script file
# Run one more step than allocated CPUs with immediate option and make
# sure it aborts
# The "sleep" is meant to ensure the earlier job steps start first
#
exec $bin_rm -f $file_in $file_out
make_bash_script $file_in "
inx=0
if \[ \$SLURM_TASKS_PER_NODE -gt 32 \]; then
	sleep_secs=45
else
	sleep_secs=10
fi
while \[ \$inx -lt \$SLURM_TASKS_PER_NODE \]
do
	$srun --exclusive -n1 --mem=0 $bin_sleep \$sleep_secs &
	inx=\$((inx+1))
done
$bin_sleep 4
$srun -v --exclusive -n1 --mem=0 --immediate ./$file_in2 &
wait
"

#
# Spawn a job via sbatch
#
# Reset job_id first: otherwise the id of the previous (finished) job would
# survive a failed submission and the "== 0" guard below could never fire.
set job_id 0
spawn $sbatch -N1 -t2 --gres=craynetwork:0 --output=$file_out2 ./$file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "sbatch not responding"
		set exit_code 1
		exp_continue
	}
	eof {
		wait
	}
}
if { $job_id == 0 } {
	fail "Failed to submit job"
}

#
# Wait for job to complete
#
if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "Waiting for job to complete"
	cancel_job $job_id
	set exit_code 1
}

#
# Check for desired output
# The --immediate step must not have run (no StepId lines) and exactly one
# "Unable to create " message is expected.
#
if {[wait_for_file $file_out2] != 0} {
	fail "Output file $file_out2 is missing"
}
set matches 0
spawn $bin_cat $file_out2
expect {
	-re "StepId=$job_id" {
		log_error "Problem --exclusive and --immediate option for step"
		set exit_code 1
		exp_continue
	}
	-re "Unable to create " {
		log_debug "This error was expected, no worries"
		incr matches
		exp_continue
	}
	eof {
		wait
	}
}
if { $matches != 1 } {
	log_error "Problem --exclusive and --immediate option for step ($matches != 1)"
	set exit_code 1
}

#
# Verify that all GPUs and other GRES are allocated with the --exclusive flag
#
set job_id 0
spawn $srun -N1 -w $node_name --gres=gpu --exclusive $bin_printenv SLURM_JOBID
expect {
	-re "($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "srun not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
# Without a job id the GRES comparison below would query job 0.
if { $job_id == 0 } {
	fail "Failed to get job id from srun"
}

# Check all GRES of the node were allocated on the job
set gres_dict_job  [count_gres [get_job_param $job_id "JOB_GRES"]]
set gres_dict_node [count_gres [get_node_param $node_name "Gres"]]

dict for {gres_name gres_count} $gres_dict_node {
	if {![dict exists $gres_dict_job $gres_name]} {
		log_error "Gres $gres_name on node $node_name not allocated on job $job_id with --exclusive"
		set exit_code 1
	} else {
		set gres_count_job [dict get $gres_dict_job $gres_name]
		if { $gres_count_job != $gres_count } {
			log_error "Gres $gres_name on node $node_name not fully allocated on job $job_id with --exclusive ($gres_count_job != $gres_count)"
			set exit_code 1
		}
	}
}

if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "job wasn't able to complete\n"
	cancel_job $job_id
	set exit_code 1
}

# Temporary files are removed only on success; they are left in place on
# failure.
if {$exit_code == 0} {
	exec $bin_rm -f $file_in $file_in2 $file_out $file_out2
} else {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}