#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test of CPU affinity support for multi-core systems.
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "WARNING: ..." with an explanation of why the test can't be made, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2005-2007 The Regents of the University of California.
# Copyright (C) 2008 Lawrence Livermore National Security.
# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
# Written by Morris Jette
# CODE-OCEC-09-009. All rights reserved.
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
############################################################################
source ./globals

set test_id     "1.91"
set exit_code   0
set file_prog   "test$test_id.prog"
set prompt      "PROMPT:"

print_header $test_id

# Skip (exit 0 with a WARNING) on configurations this test declares itself
# incompatible with.
if {[test_select_type_params "CR_ONE_TASK_PER_CORE"]} {
	send_user "\nWARNING: This test is incompatible SelectTypeParameters=CR_ONE_TASK_PER_CORE\n"
	exit 0
}

#
# Test if CPU affinity support is supported.
#
if {![test_cpu_affinity]} {
	send_user "\nWARNING: CPU affinity not supported on this system\n"
	exit 0
}
send_user "\ntask affinity plugin installed\n"

# Skip when the task plugin parameters mention boards, sockets or cores.
# "string first" takes the needle first and the haystack second, so the unit
# name must be the first argument and the configured parameters the second.
set task_params [string tolower [get_affinity_params]]
foreach bind_unit {boards sockets cores} {
	if {[string first $bind_unit $task_params] != -1} {
		send_user "\nWARNING: TaskPluginParams=$task_params not supported\n"
		exit 0
	}
}

# Skip when the default partition reports OverSubscribe=FORCE
set force 0
log_user 0
spawn $scontrol show partition [default_partition]
expect {
	-re "OverSubscribe=FORCE" {
		set force 1
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$force == 1} {
	send_user "\nWARNING: This test is not compatible with OverSubscribe=FORCE\n"
	exit 0
}

# Identify a usable node
set timeout $max_job_delay
set node_name ""
set srun_pid [spawn $srun -N1 --exclusive --verbose $bin_printenv SLURMD_NODENAME]
expect {
	-re "on host ($alpha_numeric_under)" {
		set node_name $expect_out(1,string)
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$node_name eq ""} {
	send_user "\nFAILURE: failed to get a usable node name\n"
	exit 1
}

# Determine how many cpus, sockets, cores, and threads the node has
set num_cputot   0
set num_sockets  0
set num_cores    0
set num_threads  0
log_user 0
spawn $scontrol show node $node_name
expect {
	-re "CoresPerSocket=($number)" {
		set num_cores $expect_out(1,string)
		exp_continue
	}
	-re "CPUTot=($number)" {
		set num_cputot $expect_out(1,string)
		exp_continue
	}
	-re "Sockets=($number)" {
		set num_sockets $expect_out(1,string)
		exp_continue
	}
	-re "ThreadsPerCore=($number)" {
		set num_threads $expect_out(1,string)
		exp_continue
	}
	eof {
		wait
	}
}
log_user 1
if {$num_cputot == 0 || $num_sockets == 0 || $num_cores == 0 || $num_threads == 0} {
	send_user "\nWARNING: Could not determine number of CPUs:Sockets:Cores:Threads (saw $num_cputot:$num_sockets:$num_cores:$num_threads)\n"
	exit 0
}

# Intel KNL nodes with 272 threads generate huge numeric masks
# which are too large for expect integer values to manage
set total_thread_count [expr {$num_sockets * $num_cores * $num_threads}]
if {$total_thread_count > 64} {
	send_user "\nWARNING: Total thread count too large for Expect to process ($total_thread_count > 64)\n"
	send_user "WARNING: Expect unable to work with more than 32-bit numbers\n"
	exit 0
}

send_user "Node config: CPUs=$num_cputot Sockets=$num_sockets Cores=$num_cores Threads=$num_threads\n\n"
if {$num_cputot < $total_thread_count} {
	send_user "WARNING: CPUs=$num_cputot < (Sockets($num_sockets)*Cores($num_cores)*Threads($num_threads))\n"
	send_user "WARNING: Test incompatible with lowered CPUs count.\n"
	exit 0
}

#
# Build a test program to report affinity by task
#
exec $bin_rm -f $file_prog
exec $bin_cc -I$build_dir $file_prog.c -o $file_prog
exec $bin_chmod 700 $file_prog

#
# Create an allocation
#
global env
set env(SLURM_CPU_BIND) "verbose"
set salloc_pid [spawn $salloc -w $node_name -N1 --exclusive --verbose -t2 $bin_bash]
expect {
	-re "Granted job allocation ($number)" {
		# Give the spawned shell a recognizable prompt to synchronize on
		set job_id $expect_out(1,string)
		send "export PS1=\"$prompt\"\r"
		exp_continue
	}
	-re "export PS1=\"$prompt\"\r" {
		# Echo of the command just sent, not the prompt itself
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: salloc not responding "
		send_user "or failure to recognize prompt\n"
		slow_kill $salloc_pid
		exit 1
	}
	-re $prompt {
	}
}

# Consume any further prompts already in the buffer
set timeout 1
expect {
	-re $prompt {
		exp_continue
	}
	timeout {
	}
}
set timeout 30

#
# Reading a second prompt is required by some versions of Expect
#
set timeout 1
expect {
	-re $prompt {
		exp_continue
	}
	timeout {
	}
}
set timeout 30
#############################################################################
#
# Run a job step to get allocated processor count and affinity
#
set mask 0
set task_cnt 0
send "$srun -c1 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		# One report line per task: count it and keep the last mask seen
		set mask $expect_out(2,string)
		incr task_cnt
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: salloc not responding "
		send_user "or failure to recognize prompt\n"
		slow_kill $salloc_pid
		exit 1
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $prompt
}

#############################################################################
#
# Run a job step with affinity to verify unique masks with min -B 1:1:1
#
# The reported masks are summed and compared with a value that has the low
# $task_cnt bits set.
set expected_mask [expr {(1 << $task_cnt) - 1}]
set task_mask 0
send "$srun -c1 -n $task_cnt -B 1:1:1 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun (from --allocate) not responding "
		send_user "or failure to recognize prompt\n"
		set exit_code 1
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $prompt
}
if {$task_mask != $expected_mask} {
	send_user "\nFAILURE: affinity mask inconsistency ($task_mask,$expected_mask)\n"
	set exit_code 1
}

#############################################################################
#
# Run varying number of sockets, verify task count and number of set bits
#
for {set socket_cnt 1} {$socket_cnt <= $num_sockets} {incr socket_cnt} {
	set expected_tasks [expr {$socket_cnt * $num_cores * $num_threads}]
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -B $socket_cnt-$socket_cnt:$num_cores:$num_threads ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks
			# count number of set bits
			for {set remaining $expect_out(2,string)} {$remaining > 0} {set remaining [expr {$remaining >> 1}]} {
				if {$remaining & 1} {
					incr num_bits
				}
			}
			exp_continue
		}
		-re "error" {
			send_user "\nFAILURE: some error occurred\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun (from --allocate) not responding "
			send_user "or failure to recognize prompt\n"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n"
		set exit_code 1
	}
	if {$num_bits != $expected_tasks} {
		send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$expected_tasks)\n"
		set exit_code 1
	}
}

#############################################################################
#
# Run varying number of cores, verify task count and number of set bits
#
for {set core_cnt 1} {$core_cnt <= $num_cores} {incr core_cnt} {
	set expected_tasks [expr {$num_sockets * $core_cnt * $num_threads}]
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -B $num_sockets:$core_cnt-$core_cnt:$num_threads ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks
			# count number of set bits
			for {set remaining $expect_out(2,string)} {$remaining > 0} {set remaining [expr {$remaining >> 1}]} {
				if {$remaining & 1} {
					incr num_bits
				}
			}
			exp_continue
		}
		-re "error" {
			send_user "\nFAILURE: some error occurred\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: salloc not responding "
			send_user "or failure to recognize prompt\n"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n"
		set exit_code 1
	}
	if {$num_bits != $expected_tasks} {
		send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$expected_tasks)\n"
		set exit_code 1
	}
}
#############################################################################
#
# Run varying number of threads, verify task count and number of set bits
#
for {set thread_cnt 1} {$thread_cnt <= $num_threads} {incr thread_cnt} {
	set expected_tasks [expr {$num_sockets * $num_cores * $thread_cnt}]
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -B $num_sockets:$num_cores:$thread_cnt-$thread_cnt ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks
			# count number of set bits
			for {set remaining $expect_out(2,string)} {$remaining > 0} {set remaining [expr {$remaining >> 1}]} {
				if {$remaining & 1} {
					incr num_bits
				}
			}
			exp_continue
		}
		-re "error" {
			send_user "\nFAILURE: some error occurred\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: salloc not responding "
			send_user "or failure to recognize prompt\n"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n"
		set exit_code 1
	}
	if {$num_bits != $expected_tasks} {
		send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$expected_tasks)\n"
		set exit_code 1
	}
}

#############################################################################
#
# Run varying cpus per task, verify task count and number of set bits
#
# Each step is expected to report exactly one task whose mask has as many
# bits set as the requested -c value.
for {set cpus_per_task 1} {$cpus_per_task <= $task_cnt} {incr cpus_per_task} {
	set expected_tasks 1
	set num_tasks 0
	set num_bits  0
	set task_mask 0
	send "$srun -c$cpus_per_task -B 1:1:1 ./$file_prog\r"
	expect {
		-re "TASK_ID:($number),MASK:($number)" {
			incr task_mask $expect_out(2,string)
			incr num_tasks
			# count number of set bits
			for {set remaining $expect_out(2,string)} {$remaining > 0} {set remaining [expr {$remaining >> 1}]} {
				if {$remaining & 1} {
					incr num_bits
				}
			}
			exp_continue
		}
		-re "error" {
			send_user "\nFAILURE: some error occurred\n"
			set exit_code 1
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: salloc not responding "
			send_user "or failure to recognize prompt\n"
			set exit_code 1
		}
		-re "$srun" {
			# just so we don't grab the srun call
			exp_continue
		}
		-re $prompt
	}
	if {$num_tasks != $expected_tasks} {
		send_user "\nFAILURE: number of tasks inconsistent ($num_tasks,$expected_tasks)\n"
		set exit_code 1
	}
	if {$num_bits != $cpus_per_task} {
		send_user "\nFAILURE: number of set bits inconsistent ($num_bits,$cpus_per_task)\n"
		set exit_code 1
	}
}

#############################################################################
#
# Run a job step with plane distribution to exercise option
# Automatic binding in slurm version 2.0 will bind one task per core
#
set expected_mask [expr {(1 << $task_cnt) - 1}]
set task_mask 0
send "$srun -n $task_cnt -m plane=4 ./$file_prog\r"
expect {
	-re "TASK_ID:($number),MASK:($number)" {
		incr task_mask $expect_out(2,string)
		exp_continue
	}
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: salloc not responding "
		send_user "or failure to recognize prompt\n"
		set exit_code 1
	}
	-re "$srun" {
		# just so we don't grab the srun call
		exp_continue
	}
	-re $prompt
}
if {$task_mask != $expected_mask} {
	send_user "\nFAILURE: affinity mask inconsistency ($task_mask,$expected_mask)\n"
	set exit_code 1
}

#############################################################################
#
# Terminate the job, free the allocation
#
send "exit\r"
expect {
	-re "error" {
		send_user "\nFAILURE: some error occurred\n"
		set exit_code 1
	}
	timeout {
		send_user "\nFAILURE: salloc not responding "
		send_user "or failure to recognize prompt\n"
		slow_kill $salloc_pid
		set exit_code 1
	}
	eof {
		wait
	}
}

# Report the result. The compiled helper is removed on success and also when
# the failure is reported only as a configuration-override NOTE.
if {$exit_code == 0} {
	exec $bin_rm -f $file_prog
	send_user "\nSUCCESS\n"
} elseif {[test_config_overrides] == 1} {
	exec $bin_rm -f $file_prog
	send_user "\nNOTE: This test can fail if the node configuration in slurm.conf\n"
	send_user " (sockets, cores, threads) differs from the actual configuration\n"
	send_user " or if using task/cgroup without task/affinity.\n"
} else {
	send_user "\nFAILURE\n"
	set exit_code 1
}
exit $exit_code