#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Test --gpu-freq options
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2018 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA.
############################################################################
# NOTE(review): number, srun, scontrol, bin_rm, max_job_delay,
# controlmachine_regex and the helper procs used below are not defined in
# this file; presumably they all come from ./globals.
source ./globals

set test_id     "39.9"
set exit_code   0
set file_in     "test$test_id.input"

# Pattern for the frequency line this test looks for in the srun output.
# Capture group 1 is the MEMORY frequency, group 2 is the GRAPHICS frequency.
set freq_parse_nvml    "GpuFreq=memory_freq:($number),graphics_freq:($number)"
# Pattern treated below as evidence that the gpu/generic plugin is in use.
set freq_parse_generic "GpuFreq=control_disabled"
set generic_msg "The gpu/generic plugin is loaded, so Slurm can't really test GPU frequency operations. Please set `Autodetect=nvml` in gres.conf to load the gpu/nvml plugin instead."

print_header $test_id

#
# Skip (exit 0) when the configuration cannot support this test
#
if {[have_nvml] == 0} {
	log_warn "NVML must be installed and enabled to test GPU frequency operations."
	exit 0
}

if {[slurmd_user_root] == 0} {
	log_warn "SlurmdUser must be root to test GPU frequency operations."
	exit 0
}

print_time

if {[test_cons_tres]} {
	send_user "\nValid configuration, using select/cons_tres\n"
} else {
	send_user "\nWARNING: This test is only compatible with select/cons_tres\n"
	exit 0
}

set gpu_cnt [get_gpu_count 1]
if {$gpu_cnt < 0} {
	send_user "\nFAILURE: Error getting GPU count\n"
	exit 1
}
if {$gpu_cnt < 1} {
	send_user "\nWARNING: This test requires 1 or more GPU in the default partition\n"
	exit 0
}
get_node_config
send_user "\nGPU count is $gpu_cnt\n"

#
# Build input script file
# The job prints its host and visible GPUs, then its own job record
#
exec $bin_rm -f $file_in
make_bash_script $file_in "echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES
$scontrol -dd show job \$SLURM_JOB_ID
exit 0"

#
# Test of --gpu-freq=low,verbose
#
send_user "\n\nTEST 1\n"
set timeout $max_job_delay
set match 0
set srun_pid [spawn $srun --gpus-per-node=1 --gpu-freq=low,verbose -J "test$test_id" -t1 ./$file_in]
expect {
	-re $freq_parse_nvml {
		incr match
		exp_continue
	}
	-re $freq_parse_generic {
		# Emit warning once and exit; remove the generated input
		# script first so the skipped run leaves nothing behind
		log_warn $generic_msg
		exec $bin_rm -f $file_in
		exit $exit_code
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	send_user "\nFAILURE: srun '--gpu-freq=low,verbose' failure ($match != 1)\n"
	set exit_code 1
}

#
# Test of --gpu-freq=medium,memory=medium,verbose
#
send_user "\n\nTEST 2\n"
set match 0
set srun_pid [spawn $srun --gpus-per-node=1 --gpu-freq=medium,memory=medium,verbose -J "test$test_id" -t1 ./$file_in]
expect {
	-re $freq_parse_nvml {
		incr match
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	send_user "\nFAILURE: srun '--gpu-freq=medium,memory=medium,verbose' failure ($match != 1)\n"
	set exit_code 1
}

#
# Test of --gpu-freq=highm1,verbose
#
send_user "\n\nTEST 3\n"
set match 0
set srun_pid [spawn $srun --gpus-per-node=1 --gpu-freq=highm1,verbose -J "test$test_id" -t1 ./$file_in]
expect {
	-re $freq_parse_nvml {
		incr match
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	send_user "\nFAILURE: srun '--gpu-freq=highm1,verbose' failure ($match != 1)\n"
	set exit_code 1
}

#
# Test of --gpu-freq=high,memory=high,verbose
# Also records the node the job ran on, so that TEST 5 and TEST 6 can be
# pinned to that same node (-w) and their frequencies compared with these
#
send_user "\n\nTEST 4\n"
set hostname "UNKNOWN"
set match 0
set srun_pid [spawn $srun --gpus-per-node=1 --gpu-freq=high,memory=high,verbose -J "test$test_id" -t1 ./$file_in]
expect {
	-re $freq_parse_nvml {
		set high_mem  $expect_out(1,string)
		set high_freq $expect_out(2,string)
		incr match
		exp_continue
	}
	-re " NodeList=($controlmachine_regex)" {
		set hostname $expect_out(1,string)
		incr match
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match == 2} {
	# "match" is deliberately NOT reset here: TEST 5 and TEST 6 each add
	# one more frequency match, for an expected running total of 4
	send_user "\n\nTEST 5\n"
	set srun_pid [spawn $srun -w $hostname --gpus-per-node=1 --gpu-freq=medium,memory=medium,verbose -J "test$test_id" -t1 ./$file_in]
	expect {
		-re $freq_parse_nvml {
			set medium_mem  $expect_out(1,string)
			set medium_freq $expect_out(2,string)
			incr match
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}

	send_user "\n\nTEST 6\n"
	set srun_pid [spawn $srun -w $hostname --gpus-per-node=1 --gpu-freq=low,memory=low,verbose -J "test$test_id" -t1 ./$file_in]
	expect {
		-re $freq_parse_nvml {
			set low_mem  $expect_out(1,string)
			set low_freq $expect_out(2,string)
			incr match
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: srun not responding\n"
			slow_kill $srun_pid
			set exit_code 1
		}
		eof {
			wait
		}
	}

	if {$match != 4} {
		send_user "\nFAILURE: srun '--gpu-freq=x,memory=x,verbose' failure ($match != 4)\n"
		set exit_code 1
	} elseif {![info exists high_freq] || ![info exists medium_freq] || ![info exists low_freq]} {
		# The running total can reach 4 without every job having
		# reported once; report that instead of reading an unset variable
		send_user "\nFAILURE: srun '--gpu-freq=x,memory=x,verbose' did not report a frequency for every job\n"
		set exit_code 1
	} else {
		if {$low_freq > $medium_freq || $medium_freq > $high_freq} {
			send_user "\nFAILURE: GPU frequency low > medium or medium > high\n"
			set exit_code 1
		}
		if {$low_mem > $medium_mem || $medium_mem > $high_mem} {
			send_user "\nFAILURE: GPU memory frequency low > medium or medium > high\n"
			set exit_code 1
		}
	}
} else {
	send_user "\nFAILURE: srun '--gpu-freq=x,memory=x,verbose' failure ($match != 2)\n"
	set exit_code 1
}

#
# Test of --gpu-freq=verbose
# Frequency will be system default (see "GpuFreqDef" in slurm.conf)
#
send_user "\n\nTEST 7\n"
set match 0
set srun_pid [spawn $srun --gpus-per-node=1 --gpu-freq=verbose -J "test$test_id" -t1 ./$file_in]
expect {
	-re $freq_parse_nvml {
		incr match
		exp_continue
	}
	timeout {
		send_user "\nFAILURE: srun not responding\n"
		slow_kill $srun_pid
		set exit_code 1
	}
	eof {
		wait
	}
}
if {$match != 1} {
	send_user "\nFAILURE: srun '--gpu-freq=verbose' failure ($match != 1)\n"
	set exit_code 1
}

# The input script is kept on failure so that it can be inspected
if {$exit_code == 0} {
	exec $bin_rm -f $file_in
	send_user "\nSUCCESS\n"
}
print_time
exit $exit_code