#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test that the core spec option in sbatch allocates the correct
#          number of cores and that tasks spread over multiple nodes
#          when there is not enough resources on one node.
#
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2014 SchedMD LLC
# Written by Nathan Yee
#
# This file is part of Slurm, a resource management program.
# For details, see .
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set test_id     "17.34"
set file_in     "test$test_id\.in"
set file_out    "test$test_id\.out"
set spec_in     "spec_core_script\.in"
set exit_code   0

#############################################################################
#
# Checks that the node uses the correct number of specialized cores
# and that the number of nodes the job uses is correct.
#
# Arguments:
#   task      - offset added to the number of CPUs left on the node after
#               core specialization; the absolute value of the sum (at
#               least 1) is used as the sbatch task count
#   node      - node name passed to sbatch with -w
#   core_spec - specialized core count passed to sbatch with -S
#   exp_nodes - expected outcome, see below
#
# exp_nodes = 0: job must only use the specified node
# exp_nodes = 1: job must use more than the specified node
# exp_nodes = -1: job must fail because the job exceeds the number of cores
#
# Failures are reported with send_user and recorded in the global
# exit_code; a missing submission or an unexpectedly accepted job
# terminates the test immediately with exit 1.
#
#############################################################################
proc core_spec_job {task node core_spec exp_nodes} {
	global sbatch scontrol spec_in file_out number thread_cnt exit_code
	global cpu_tot
	set job_id 0
	set num_nodes 0

	# Determine the number of tasks that can be run
	set cpu_used_by_spec [expr $thread_cnt * $core_spec]
	if {$cpu_tot > $cpu_used_by_spec} {
		set task_limit [expr $cpu_tot - $cpu_used_by_spec]
	} else {
		set task_limit 1
	}
	set ntasks [expr abs($task_limit + $task)]
	if {$ntasks == 0} {
		set ntasks 1
	}

	set error_chk 0
	spawn $sbatch -t1 -w$node -S$core_spec -n$ntasks -o$file_out $spec_in
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
			exp_continue
		}
		-re "error" {
			# An error is only acceptable when the caller expects
			# the submission to be rejected.
			if {$exp_nodes != -1} {
				send_user "\nFAILURE: sbatch should not have produced an error\n"
				set exit_code 1
			}
			set error_chk 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: sbatch is not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$job_id == 0 && $error_chk == 0} {
		send_user "\nFAILURE: Job was not submitted\n"
		exit 1
	} elseif {$exp_nodes == -1 && $job_id != 0} {
		send_user "\nFAILURE: This job should have failed but did not\n"
		exit 1
	} elseif {$exp_nodes == -1 && $error_chk != 0} {
		send_user "\nThis error is expected do not worry\n"
	} elseif {$job_id == 0} {
		# sbatch reported an unexpected error and no job was created.
		# The failure has already been reported and recorded in
		# exit_code above; there is no job to wait for or inspect, so
		# skip the remaining checks instead of querying job id 0.
		return
	} else {
		set core_chk 0
		if {[wait_for_job $job_id "RUNNING"] != 0} {
			send_user "\nFAILURE: waiting for job to start\n"
			set exit_code 1
		}

		# Read back the node count and the core specialization that
		# scontrol reports for the job.
		spawn $scontrol show job $job_id
		expect {
			-re "NumNodes=($number)" {
				set num_nodes $expect_out(1,string)
				exp_continue
			}
			-re "CoreSpec=$core_spec" {
				set core_chk 1
				exp_continue
			}
			timeout {
				send_user "\nFAILURE: scontrol is not responding\n"
				set exit_code 1
			}
			eof {
				wait
			}
		}

		if {$core_chk == 0} {
			send_user "\nFAILURE: Job $job_id does not have the correct number of specialized cores\n"
			set exit_code 1
		}

		if {[wait_for_job $job_id "DONE"] != 0} {
			send_user "\nFAILURE: waiting for job to complete\n"
			set exit_code 1
		}
	}

	# Compare the observed node count with the expectation.
	if {$exp_nodes == 1} {
		if {$num_nodes <= 1} {
			send_user "\nFAILURE: Job $job_id should use more then 1 node\n"
			set exit_code 1
		}
	}

	if {$exp_nodes == 0} {
		if {$num_nodes != 1} {
			send_user "\nFAILURE: Job $job_id should use only $node\n"
			set exit_code 1
		}
	}
}

#############################################################################
#
# Tests begin here
#
#############################################################################

print_header $test_id

if {[test_linear]} {
	send_user "\nWARNING: This test is incompatible with select/linear\n"
	exit 0
}
if {[test_select_type_params "CR_SOCKET"]} {
	send_user "\nWARNING: This test is incompatible with CR_SOCKET allocations\n"
	exit 0
}

set allow_spec [test_allow_spec_resc]
if {$allow_spec == 0} {
	send_user "WARNING: AllowSpecResourcesUsage not configured to permit core specialization\n"
	exit $exit_code
}

# Remove any vestigial files
exec $bin_rm -f $file_in $file_out $spec_in

# Batch script that prints the scontrol node record of the first node in
# the job's allocation; its output is parsed below for the node layout.
make_bash_script $file_in "
first=\$($scontrol show hostnames \$SLURM_JOB_NODELIST | head -n1)
$scontrol show node \$first
"

# Short-running script submitted by core_spec_job
make_bash_script $spec_in "sleep 5"

set job_id 0
spawn $sbatch --exclusive -t1 -N2 -o$file_out $file_in
expect {
	-re "Batch job submission failed" {
		send_user "\nWARNING: can't test srun task distribution\n"
		exit $exit_code
	}
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	send_user "FAILURE: sbatch did not submit job\n"
	exit 1
}

if {[wait_for_file $file_out] != 0} {
	send_user "\nFAILURE: output file was not created\n"
	exit 1
}

# Parse the node record written by the first job. The defaults below are
# used for any field that does not appear in the output.
set first_node ""
set core_cnt   0
set cpu_tot    1
set socket_cnt 1
set thread_cnt 1

spawn $bin_cat $file_out
expect {
	-re "NodeName=($alpha_numeric_under)" {
		set first_node $expect_out(1,string)
		exp_continue
	}
	-re "CoresPerSocket=($number)" {
		set core_cnt $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set cpu_tot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set socket_cnt $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set thread_cnt $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: cat is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Total cores on the node = cores per socket * sockets
set core_cnt [expr $core_cnt * $socket_cnt]
if {$core_cnt == 0} {
	send_user "\nFAILURE: sbatch did not find the number of cores\n"
	exit 1
}
if {$core_cnt < 4} {
	send_user "\nWARNING: core count too low for testing ($core_cnt < 4)\n"
	exit $exit_code
}

#
# Using the core spec within the node limits
#
send_user "\n\nRun within the specified node\n"
core_spec_job  0 $first_node [expr $core_cnt - 2] 0
core_spec_job -2 $first_node [expr $core_cnt - 2] 0

#
# Using core spec with more tasks than the node can handle. This should
# cause the tasks to spread across multiple nodes as needed
#
send_user "\n\nSpread job across multiple nodes\n"
core_spec_job 1 $first_node [expr $core_cnt - 2] 1
core_spec_job 1 $first_node [expr $core_cnt - 1] 1

#
# Using core spec with more cores than the specified node has
#
send_user "\n\nFail by trying to use more cores than exist\n"
core_spec_job 1 $first_node [expr $core_cnt + 5] -1
core_spec_job 1 $first_node [expr $core_cnt + 7] -1

if {$exit_code == 0} {
	send_user "\nSUCCESS\n"
	exec $bin_rm -f $file_in $file_out $spec_in
}
exit $exit_code