#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
# Submit job directly to slurmd without use of slurmctld scheduler.
# (--no-allocate option). NOTE: Needs to run as SlurmUser or root.
#
# Output: "TEST: #.#" followed by "SUCCESS" if test was successful, OR
# "WARNING: ..." with an explanation of why the test can't be made, OR
# "FAILURE: ..." otherwise with an explanation of the failure, OR
# anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2002-2007 The Regents of the University of California.
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
############################################################################
source ./globals

set test_id     "1.26"
set iterations  100
set exit_code   0

print_header $test_id

#
# Preconditions: the --no-allocate option bypasses slurmctld, so the test
# is skipped (exit 0 with a WARNING) when its requirements are not met.
#
if {[is_super_user] == 0} {
	send_user "\nWARNING: This test can't be run except as SlurmUser\n"
	exit 0
}
if {[test_front_end]} {
	send_user "\nWARNING: This test is incompatible with front-end systems\n"
	exit $exit_code
}

# [string compare] returns non-zero when the strings differ, i.e. when the
# configured switch type is anything other than "none".
set switch [switch_type]
if {[string compare $switch "none"]} {
	send_user "\nWARNING: This test is incompatible with switch/$switch\n"
	exit $exit_code
}

#
# Submit a 1 node job and record the node name
#
# NOTE: Check explicitly for "^0:" or "\n0:". Otherwise in srun verbose mode
# we can get a hostname ending with 0 in the messages that gets used to
# generate a bad value for host_0 below.
#
set host_0        ""
set nodelist_name ""
set timeout $max_job_delay
set srun_pid [spawn $srun -v -N1 -l -t1 $bin_printenv SLURMD_NODENAME]
expect {
	-re "on host ($alpha_numeric_under)," {
		set nodelist_name $expect_out(1,string)
		exp_continue
	}
	-re "^0: ($alpha_numeric_under)" {
		set host_0 $expect_out(1,string)
		exp_continue
	}
	-re "\n0: ($alpha_numeric_under)" {
		set host_0 $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}

#
# Verify node count
#
if {[string compare $host_0 ""] == 0} {
	send_user "\nFAILURE: Did not get hostname of task 0\n"
	exit 1
}
if {[string compare $nodelist_name ""] == 0} {
	send_user "\nFAILURE: Did not get nodelist_name of task 0\n"
	exit 1
}
if {[string compare $host_0 $nodelist_name] != 0} {
	send_user "\nWARNING: hostname inconsistency\n"
}
set include_node $host_0

#
# Submit a job directly to that node
#
# slurm_user is cleared when the step cannot be run or judged (credential
# rejected, or srun hung); host_1 is forced to host_0 for the known,
# unavoidable transient errors so they are not reported as a wrong node.
#
set host_1     ""
set slurm_user 1
set timeout    10
set srun_pid [spawn $srun -N1 -l --nodelist=$include_node --no-allocate -t1 $bin_printenv SLURMD_NODENAME]
expect {
	-re "Invalid job credential" {
		send_user "\nWARNING: Not SlurmUser or root.\n"
		set slurm_user 0
		exp_continue
	}
	-re "error: .*try again" {
		send_user "Can't avoid this possible error\n"
		set host_1 $host_0
		exp_continue
	}
	-re "error: .*already in shared memory" {
		send_user "Can't avoid this possible error\n"
		set host_1 $host_0
		exp_continue
	}
	-re "error: .*exit code 1" {
		exp_continue
	}
	-re "0: ($alpha_numeric_under)" {
		set host_1 $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding.\n"
		slow_kill $srun_pid
		set slurm_user 0
		set exit_code 1
	}
	eof {
		wait
	}
}

# Stop here if the direct launch could not be evaluated. Exit with the
# accumulated exit_code rather than a literal 0: the timeout branch above
# also clears slurm_user, and a hung srun must be reported as a failure,
# while the "Invalid job credential" case still exits 0 (exit_code untouched).
if {$slurm_user == 0} {
	exit $exit_code
}
if {[string compare $host_1 $include_node]} {
	send_user "\nFAILURE: Allocation lacked an included node\n"
	set exit_code 1
}

#
# Run three tasks at a time on some node and do so repeatedly
# This checks for slurmd race conditions
# The sleep between cycles is to make sure the job step completion
# logic has time to be processed (slurmd -> slurmctld messages)
# Note: process output in order of expected completion
#
set successes 0
for {set inx 0} {$inx < $iterations} {incr inx} {
	exec $bin_sleep 0.25
	set failures 0

	# One regular (allocated) step plus two --no-allocate (-Z) steps,
	# all started before any output is consumed so they run concurrently.
	set srun_pid [spawn $srun -N1 --nodelist=$nodelist_name -t1 -l $bin_printenv SLURMD_NODENAME]
	set alloc $spawn_id

	set srun_pid1 [spawn $srun -N1 --nodelist=$include_node -t1 -Z $bin_sleep 0.5]
	set noalloc1 $spawn_id

	set srun_pid2 [spawn $srun -N1 --nodelist=$include_node -t1 -Z $bin_sleep 0.25]
	set noalloc2 $spawn_id

	# Shortest sleep (0.25s) is expected to finish first.
	set timeout 20
	set spawn_id $noalloc2
	expect {
		-i $noalloc2
		-re "error:.*configuring interconnect" {
			send_user "Can't avoid this possible error\n"
			exp_continue
		}
		-re "error:" {
			send_user "\nFAILURE: some error happened\n"
			set failures 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid2
			set failures 1
		}
		eof {
			wait
		}
	}

	# Then the 0.5s sleep.
	set spawn_id $noalloc1
	expect {
		-i $noalloc1
		-re "error:.*configuring interconnect" {
			send_user "Can't avoid this possible error\n"
			exp_continue
		}
		-re "error:" {
			send_user "\nFAILURE: some error happened\n"
			set failures 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid1
			set failures 1
		}
		eof {
			wait
		}
	}

	# Finally the allocated step, which gets the longer job timeout.
	set timeout $max_job_delay
	set spawn_id $alloc
	expect {
		-i $alloc
		-re "Invalid node name specified" {
			send_user "\nFAILURE: some error happened\n"
			set failures 1
			exp_continue
		}
		-re "error:.*configuring interconnect" {
			send_user "Can't avoid this possible error\n"
			exp_continue
		}
		-re "error:" {
			send_user "\nFAILURE: some error happened\n"
			set failures 1
			exp_continue
		}
		-re "0: ($alpha_numeric_under)" {
			set host_0 $expect_out(1,string)
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid
			set failures 1
		}
		eof {
			wait
		}
	}

	if {$failures == 0} {
		incr successes
	} else {
		set exit_code 1
	}
}

# exit_code was already set to 1 inside the loop for every failed iteration;
# this only adds a summary line.
if {$successes != $iterations} {
	send_user "\nFAILURE: only $successes of $iterations completed successfully\n"
}

if {$exit_code == 0} {
	send_user "\nSUCCESS\n"
}
exit $exit_code