#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate salloc --exclusive with -n will give all cpus on node
############################################################################
# Copyright (C) 2011-2014 SchedMD LLC
# Written by Nathan Yee
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

set job_id       0
set nodes        ""
set cputot       0
set scontrol_cpu 0
set sacct_cpu    0
set gpu_tot      0
set job_tres_cnt 0
set exit_code    0

#
# Verify that scontrol and sacct both report the full CPU count of the
# allocated node for the current job
#
proc check_alloc { } {
	global scontrol salloc sacct cputot scontrol_cpu sacct_cpu nodes
	global job_id number re_word_str

	set node_name $nodes
	spawn $scontrol show job $job_id
	expect {
		-re "NodeList=($re_word_str)" {
			set node_name $expect_out(1,string)
			exp_continue
		}
		-re "NumCPUs=($number)" {
			set scontrol_cpu $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$scontrol_cpu == 0} {
		fail "Number of cpus is invalid"
	}

	# Wait a bit for sacct to populate
	sleep 10

	spawn $sacct --job=$job_id --allocation -oalloccpus --noheader
	expect {
		-re "($number)" {
			set sacct_cpu $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "sacct is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$sacct_cpu == 0} {
		fail "Number of cpus is invalid"
	}

	spawn $scontrol show node $node_name
	expect {
		-re "CPUTot=($number)" {
			set cputot $expect_out(1,string)
			exp_continue
		}
		timeout {
			log_error "scontrol is not responding"
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$cputot != $scontrol_cpu} {
		log_error "scontrol reported $scontrol_cpu cpus were used by job $job_id when it should have used $cputot cpus"
		set exit_code 1
	}
	if {$cputot != $sacct_cpu} {
		log_error "sacct reported $sacct_cpu cpus were used by job $job_id when it should have used $cputot cpus"
		set exit_code 1
	}
}

if {[get_config_param "AccountingStorageType"] ne "accounting_storage/slurmdbd"} {
	skip "Test invalid without slurmdbd"
}

set node_name [get_nodes_by_request "--gres=gpu:2 -n1 -t1"]
if {[llength $node_name] != 1} {
	skip "This test needs to be able to submit jobs with at least --gres=gpu:2"
}
if {![param_contains [get_config_param "AccountingStorageTRES"] "gres/gpu"]} {
	skip "This test requires AccountingStorageTRES=gres/gpu"
}

# Get the total number of GPUs in the test node
set gres_node [get_node_param $node_name "Gres"]
set gpu_tot   [dict get [count_gres $gres_node] "gpu"]

#
# Verify that --exclusive with -n1 allocates all cpus on the node
#
spawn $salloc -t1 -n1 --exclusive $srun -l $bin_printenv SLURMD_NODENAME
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "($number): ($re_word_str)" {
		set nodes $expect_out(2,string)
		exp_continue
	}
	timeout {
		log_error "salloc is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
check_alloc

#
# Verify that --exclusive with -n1 and --mem allocates all cpus on the node
#
spawn $salloc -t1 -n1 --mem=100 --exclusive $srun -l $bin_printenv SLURMD_NODENAME
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	-re "($number): ($re_word_str)" {
		set nodes $expect_out(2,string)
		exp_continue
	}
	timeout {
		log_error "salloc is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}
check_alloc

#
# Verify that all GPUs and other GRES are allocated with the --exclusive flag
#
set job_id 0
spawn $salloc -t1 -n1 -w $node_name --gres=gpu --exclusive $srun -l $bin_printenv SLURMD_NODENAME
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		log_error "salloc is not responding"
		set exit_code 1
	}
	eof {
		wait
	}
}

# Check that all GRES of the node were allocated to the job
set gres_dict_job  [count_gres [get_job_param $job_id "JOB_GRES"]]
set gres_dict_node [count_gres [get_node_param $node_name "Gres"]]

dict for {gres_name gres_count} $gres_dict_node {
	if {![dict exists $gres_dict_job $gres_name]} {
		log_error "Gres $gres_name on node $node_name not allocated on job $job_id with --exclusive"
		set exit_code 1
	} else {
		set gres_count_job [dict get $gres_dict_job $gres_name]
		if {$gres_count_job != $gres_count} {
			log_error "Gres $gres_name on node $node_name not fully allocated on job $job_id with --exclusive ($gres_count_job != $gres_count)"
			set exit_code 1
		}
	}
}

if {[wait_for_job $job_id "DONE"] != 0} {
	log_error "Job $job_id wasn't able to complete"
	cancel_job $job_id
	set exit_code 1
}

if {$exit_code != 0} {
	fail "Test failed due to previous errors (\$exit_code = $exit_code)"
}