#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          sacctmgr add an account to this cluster and try using it with
#          salloc, sbatch and srun. We also test limits here as well.
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2008-2010 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Joseph Donaghy
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals
source ./globals_accounting
source ./inc21.21_tests

set test_id     "21.21"
set exit_code   0
set test_qos    "test$test_id\_qos"
set file_in     "test.$test_id.input"
set ta          "test$test_id-account.1"
set maxcpu      MaxCpus
set maxcpu_num  0
set grcpu       GrpCpus
set grcpu_num   0
set timeout     60
set test_node   " "
# cr_core = 1 / cr_cpu = 0
set selectparam 0
set one_task_pc 0

# test maxjob maxnode maxsubmit maxwall
array set acct_mod_desc {}
array set acct_mod_acct_vals {}

# Job options used to exercise each association limit. The CPU based
# entries are filled in below once the CPU count of the test node is known.
array set acct_mod_assoc_vals {
	grpnode        "-N 1"
	grpwall        "-t 1"
	grpcpus        ""
	grpcpumins     ""
	grpjobsub      "2 4"
	grpcpurunmins  ""
	maxnode        "-N 1"
	maxwall        "-t 10"
	maxcpus        ""
	maxcpumins     ""
	maxjobsub      "2 4"
}

# Limit values handed to mod_acct; every entry starts at -1 and is put back
# to -1 after the corresponding limit has been tested.
array set acct_mod_assoc_test_vals {
	grpnode     -1
	grpwall     -1
	grpcpus     -1
	grpcpumins  -1
	grpjob      -1
	grpsubmit   -1
	maxnode     -1
	maxwall     -1
	maxcpus     -1
	maxcpumins  -1
	maxjob      -1
	maxsubmit   -1
}

print_header $test_id

if { [string compare [priority_type] multifactor] } {
	send_user "\nWARNING: test only compatible with priority/multifactor plugin\n"
	exit $exit_code
}

# Determine whether SelectTypeParameters includes CR_CORE
if {[test_select_type_params "CR_CORE"]} {
	set selectparam 1
}

# Determine whether SelectTypeParameters includes CR_ONE_TASK_PER_CORE
if {[test_select_type_params "CR_ONE_TASK_PER_CORE"]} {
	set one_task_pc 1
}

#
# Run a trivial job to learn the name of a node we can test on
#
set got_node 0
spawn $srun -N1 -t1 printenv SLURM_NODELIST
expect {
	-re "($alpha_numeric_under)" {
		set test_node $expect_out(1,string)
		set got_node 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

if {$got_node != 1} {
	send_user "\nFAILURE: did not get node for testing\n"
	# A FAILURE must not be reported with a zero exit status
	exit 1
}

lassign [get_node_cpus $test_node] totcpus nthreads

if {$totcpus == 0} {
	send_user "\nFAILURE: no cpus were found\n"
	exit 1
} else {
	# Set assoc CPU values
	set cpu_opt "-n [expr {$totcpus - $nthreads}]"
	set acct_mod_assoc_vals(grpcpus)       $cpu_opt
	set acct_mod_assoc_vals(maxcpus)       $cpu_opt
	set acct_mod_assoc_vals(grpcpumins)    $cpu_opt
	set acct_mod_assoc_vals(maxcpumins)    $cpu_opt
	set acct_mod_assoc_vals(grpcpurunmins) $cpu_opt
}

#
# assoc_setup - apply one association limit to the test account.
#
# limit_type: key into acct_mod_assoc_test_vals naming the limit to set
# limit_val:  list whose second element is the new limit value
#             (e.g. "-N 1" sets the limit to 1)
#
# Returns the result of mod_acct; a non-zero value means the account could
# not be modified.
#
proc assoc_setup { limit_type limit_val } {
	global acct_mod_assoc_test_vals
	global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals ta

	set new_limit [lindex $limit_val 1]
	set acct_mod_assoc_test_vals($limit_type) $new_limit

	return [mod_acct $ta [array get acct_mod_desc] \
			[array get acct_mod_assoc_test_vals] \
			[array get acct_mod_acct_vals]]
}

#
# _test_limits - exercise every limit listed in acct_mod_assoc_vals.
#
# For each limit other than the job/submit count limits, the limit is set on
# the test account, the matching inc21_21_* check is run, and the limit is
# reset to -1. The job/submit count limits are handled entirely by
# inc21_21_submit_test. Account usage is reset after every limit.
#
# Returns 0 if every check passed, 1 on the first failure.
#
proc _test_limits { } {
	global file_in srun sbatch squeue scancel bin_id number bin_sleep bin_rm ta maxjob_lim maxsub_lim
	global acct_mod_desc acct_mod_acct_vals acct_mod_assoc_vals acct_mod_assoc_test_vals one_task_pc nthreads

	# Test jobs within the association limits
	foreach option [array names acct_mod_assoc_vals] {
		send_user "\nSetting up association limit $option...\n"

		if { $option ne "maxjobsub" && $option ne "grpjobsub" } {
			# Running the checks below is pointless if the limit
			# could not be applied, so treat that as a failure.
			if { [assoc_setup $option $acct_mod_assoc_vals($option)] } {
				send_user "\nFAILURE: unable to set association limit $option\n"
				return 1
			}

			if { $option eq "maxcpumins" && $one_task_pc } {
				set acct_mod_assoc_vals(maxcpumins) "-n [expr {[lindex $acct_mod_assoc_vals(maxcpumins) 1] / $nthreads}]"
			}

			if { $option eq "grpwall" } {
				if { [inc21_21_grpwall $option $acct_mod_assoc_vals($option)] } {
					return 1
				}
			} elseif { [string match "grp*" $option] } {
				if { [inc21_21_grp_test $option $acct_mod_assoc_vals($option)] } {
					return 1
				}
			} else {
				#
				# Test value within the association limit
				#
				if { [inc21_21_good $option $acct_mod_assoc_vals($option)] } {
					return 1
				}

				#
				# Test value over the association limit
				#
				if { [inc21_21_bad $option $acct_mod_assoc_vals($option)] } {
					return 1
				}
			}

			# Reset the limit
			set acct_mod_assoc_test_vals($option) "-1"
		} else {
			if { [inc21_21_submit_test $option] } {
				return 1
			}
		}

		# Reset usage
		reset_account_usage "" $ta
	}

	return 0
}

if {[test_linear]} {
	set def_part_name [default_partition]
	set nb_nodes [get_node_cnt_in_part $def_part_name]
	if {$nb_nodes < 2} {
		send_user "\nWARNING: This test is incompatible with select/linear and only one node\n"
		exit $exit_code
	}
}

#
# Check accounting config and bail if not found.
#
if { [test_account_storage] == 0 } {
	send_user "\nWARNING: This test can't be run without a usable AccountStorageType\n"
	exit 0
}

if { [string compare [check_accounting_admin_level] "Administrator"] } {
	send_user "\nWARNING: This test can't be run without being an Accounting administrator.\nUse: sacctmgr mod user \$USER set admin=admin.\n"
	exit 0
}

#
# Identify the user and his current default account
#
set acct_name ""
set user_name [get_my_user_name]
set s_pid [spawn $sacctmgr show user $user_name]
expect {
	-re "$user_name *($alpha_numeric_under)" {
		set acct_name $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "FAILURE: sacctmgr show not responding\n"
		slow_kill $s_pid
		exit 1
	}
	eof {
		wait
	}
}

#
# remove any vestigial account
#
set sadd_pid [spawn $sacctmgr -i delete account $ta]
expect {
	timeout {
		send_user "\nFAILURE: sacctmgr delete not responding\n"
		slow_kill $sadd_pid
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Use sacctmgr to add an account
#
set aamatches 0
set sadd_pid [spawn $sacctmgr -i add account $ta]
expect {
	-re "Adding Account" {
		incr aamatches
		exp_continue
	}
	-re "Nothing new added" {
		send_user "\nWARNING: vestigial account $ta found\n"
		incr aamatches
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sacctmgr add not responding\n"
		slow_kill $sadd_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$aamatches != 1} {
	send_user "\nFAILURE: sacctmgr had a problem adding account.\n"
	exit 1
}

#
# Add self to this new account
#
set sadd_pid [spawn $sacctmgr -i create user name=$user_name account=$ta]
expect {
	timeout {
		send_user "\nFAILURE: sacctmgr add not responding\n"
		slow_kill $sadd_pid
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Remove test QoS (left over from a previous run, if any)
#
set match 0
spawn $sacctmgr -i delete qos $test_qos
expect {
	-re "Deleting QOS" {
		set match 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sacctmgr delete not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Create the test QoS and attach it to the test account
#
set match 0
spawn $sacctmgr -i create qos $test_qos
expect {
	-re "Adding QOS" {
		set match 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sacctmgr is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	send_user "\nFAILURE: $test_qos was not created\n"
	exit 1
}

spawn $sacctmgr -i mod account $ta set qos=$test_qos
expect {
	timeout {
		send_user "\nFAILURE: sacctmgr is not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Spawn a job via salloc using this account
#
set job_id 0
set matches 0
set timeout $max_job_delay
spawn $salloc -N1 --account=$ta
expect {
	-re "Granted job allocation ($number)" {
		set job_id $expect_out(1,string)
		send "$scontrol show job $job_id\r"
		send "exit\r"
		exp_continue
	}
	-re "Account=$ta" {
		incr matches
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: salloc not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	send_user "\nFAILURE: salloc failed to initiate job\n"
	set exit_code 1
} elseif {$matches != 1} {
	send_user "\nFAILURE: salloc failed to use desired account\n"
	set exit_code 1
}

#
# Spawn a job via sbatch using this account
#
make_bash_script $file_in "$bin_id"
set job_id 0
spawn $sbatch -N1 --account=$ta --output=none $file_in
expect {
	-re "Submitted batch job ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sbatch not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	send_user "\nFAILURE: did not get sbatch job_id\n"
	set exit_code 1
} else {
	set matches 0
	spawn $scontrol show job $job_id
	expect {
		-re "Account=$ta" {
			incr matches
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: scontrol not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		send_user "\nFAILURE: sbatch failed to use specified account\n"
		set exit_code 1
	}
	cancel_job $job_id
}

#
# Spawn a job via srun using this account
#
set job_id 0
spawn $srun -N1 -v --account=$ta -t1 $bin_id
expect {
	-re "launching ($number)" {
		set job_id $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$job_id == 0} {
	send_user "\nFAILURE: did not get srun job_id\n"
	set exit_code 1
} else {
	set matches 0
	spawn $scontrol show job $job_id
	expect {
		-re "Account=$ta" {
			incr matches
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: scontrol not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$matches != 1} {
		send_user "\nFAILURE: srun failed to use specified account\n"
		set exit_code 1
	}
}

#
# Check to see if limits are enforced.
#
if { [test_limits_enforced] == 1 } {
	# Only ever raise exit_code here: assigning the result directly would
	# erase failures already recorded by the salloc/sbatch/srun checks.
	if { [_test_limits] } {
		set exit_code 1
	}
	# wait around a bit to make sure jobs finish
	sleep 3
}

#
# Use sacctmgr to delete the test account
#
# First wait a few seconds for the job to complete
sleep 2

set damatches 0
set sadel_pid [spawn $sacctmgr -i delete account $ta]
expect {
	-re "Deleting account" {
		incr damatches
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sacctmgr delete not responding\n"
		slow_kill $sadel_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$damatches != 1} {
	send_user "\nFAILURE: sacctmgr had a problem deleting account\n"
	set exit_code 1
}

#
# Remove test QoS
#
set match 0
spawn $sacctmgr -i delete qos $test_qos
expect {
	-re "Deleting QOS" {
		set match 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: sacctmgr delete not responding\n"
		set exit_code 1
	}
	eof {
		wait
	}
}

if {$exit_code == 0} {
	exec $bin_rm -f $file_in
	print_success $test_id
}

exit $exit_code