#!/usr/bin/env expect ############################################################################ # Purpose: Establish global state information for Slurm test suite # # To define site-specific state information, set the values in a file # named 'globals.local'. Those values will override any specified here. # for example: # # $ cat globals.local # set slurm_dir "/usr/local" # set build_dir "/home/mine/SLURM/build_smd" # set src_dir "/home/mine/SLURM/slurm.git" # set mpicc "/usr/local/bin/mpicc" # # If you want to have more than one test going at the same time for multiple # installs you can have multiple globals.local files and set the # SLURM_LOCAL_GLOBALS_FILE env var, and have that set to the correct # globals.local file for your various installs. The file can be named anything, # not just globals.local. # ############################################################################ # Copyright (C) 2002-2007 The Regents of the University of California. # Copyright (C) 2008-2010 Lawrence Livermore National Security. # Portions Copyright (C) 2010-2018 SchedMD LLC. # Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). # Written by Morris Jette # Additions by Joseph Donaghy # CODE-OCEC-09-009. All rights reserved. # # This file is part of Slurm, a resource management program. # For details, see . # Please also read the supplied file: DISCLAIMER. # # Slurm is free software; you can redistribute it and/or modify it under # the terms of the GNU General Public License as published by the Free # Software Foundation; either version 2 of the License, or (at your option) # any later version. # # Slurm is distributed in the hope that it will be useful, but WITHOUT ANY # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU General Public License for more # details. # # You should have received a copy of the GNU General Public License along # with Slurm; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ############################################################################ # Avoid sourcing this file multiple times if {[info procs exit] eq "exit"} { return } # # Include private functions and variables # source ./globals_private global sacctmgr sacct salloc sattach sbatch sbcast scancel scontrol sinfo global smd squeue sreport srun sstat strigger ################################################################ # # NAME # cset - conditional set # # SYNOPSIS # cset name value # # DESCRIPTION # Conditional set. Only set variable if variable does not yet exist. # # Input: name -- name of the variable to set # value -- value to set to 'name' # ################################################################ proc cset {name value} { if {![uplevel 1 info exists $name]} { upvar $name tmp set tmp $value } } # # Defining colors here to be able to use them in globals.local. # By default, these colors are bold # set COLOR_RED "\033\[1;31m" set COLOR_RED_NORMAL "\033\[31m" set COLOR_ORANGE "\033\[1;38;5;208m" set COLOR_YELLOW "\033\[1;33m" set COLOR_GREEN "\033\[1;32m" set COLOR_BLUE "\033\[1;34m" set COLOR_MAGENTA "\033\[1;35m" set COLOR_CYAN "\033\[1;36m" set COLOR_NONE "\033\[0m" cset local_globals_file "./globals.local" # Log level "enum" # Define log levels here so they are available in globals.local set LOG_LEVEL_QUIET 0 set LOG_LEVEL_FATAL 1 set LOG_LEVEL_ERROR 2 set LOG_LEVEL_WARNING 3 set LOG_LEVEL_INFO 4 set LOG_LEVEL_DEBUG 5 set LOG_LEVEL_TRACE 6 if {[info exists env(SLURM_LOCAL_GLOBALS_FILE)]} { set local_globals_file $env(SLURM_LOCAL_GLOBALS_FILE) } if [file exists $local_globals_file] { source $local_globals_file } # # Specify the slurm install directory. # Used to locate binaries, libraries, and header files. # cset slurm_dir "/usr" cset build_dir "../../" cset src_dir "../../" cset config_h "${build_dir}/config.h" cset sacctmgr "${slurm_dir}/bin/sacctmgr" cset sacct "${slurm_dir}/bin/sacct" cset salloc "${slurm_dir}/bin/salloc" cset sattach "${slurm_dir}/bin/sattach" cset sbatch "${slurm_dir}/bin/sbatch" cset sbcast "${slurm_dir}/bin/sbcast" cset scancel "${slurm_dir}/bin/scancel" cset scontrol "${slurm_dir}/bin/scontrol" cset sdiag "${slurm_dir}/bin/sdiag" cset sgather "${slurm_dir}/bin/sgather" cset sh5util "${slurm_dir}/bin/sh5util" cset sinfo "${slurm_dir}/bin/sinfo" cset smd "${slurm_dir}/bin/smd" cset sprio "${slurm_dir}/bin/sprio" cset squeue "${slurm_dir}/bin/squeue" cset srun "${slurm_dir}/bin/srun" cset sreport "${slurm_dir}/bin/sreport" cset sshare "${slurm_dir}/bin/sshare" cset sstat "${slurm_dir}/bin/sstat" cset strigger "${slurm_dir}/bin/strigger" cset slurmd "${slurm_dir}/sbin/slurmd" cset pbsnodes "${slurm_dir}/bin/pbsnodes" cset qdel "${slurm_dir}/bin/qdel" cset qstat "${slurm_dir}/bin/qstat" cset qsub "${slurm_dir}/bin/qsub" cset qalter "${slurm_dir}/bin/qalter" cset qrerun "${slurm_dir}/bin/qrerun" cset lsid "${slurm_dir}/bin/lsid" cset bjobs "${slurm_dir}/bin/bjobs" cset bkill "${slurm_dir}/bin/bkill" cset bsub "${slurm_dir}/bin/bsub" # If using MPICH-2 or other version of MPI requiring pmi libary, use this #cset mpicc "/home/jette/mpich2-install/bin/mpicc" #cset use_pmi 1 # OR for other versions of MPICH, use this cset mpicc "/usr/local/bin/mpicc" cset nvcc "/usr/bin/nvcc" cset use_pmi 0 #cset upcc "/usr/local/bin/upcc" cset upcc "/usr/bin/xlupc" cset oshcc "/usr/local/bin/oshcc" cset mpirun "mpirun" cset totalviewcli "/usr/local/bin/totalviewcli" # Set if using "--enable-memory-leak-debug" configuration option cset enable_memory_leak_debug 0 # test_prompt: to be used as prompt for interactive shells set test_prompt "TEST_PROMPT: " # reset_bash_prompt: to be used as command on scripts or interactive jobs set reset_bash_prompt "unset PROMPT_COMMAND; export PS1=\"$test_prompt\"" # # Specify locations of other executable files used # Only the shell names (e.g. bin_bash) must be full pathnames # cset bin_awk "awk" cset bin_bash [exec which bash | tail -n 1] cset bin_cat "cat" cset bin_cc "gcc" cset bin_chmod "chmod" cset bin_cmp "cmp" cset bin_cp "cp" cset bin_date "date" cset bin_diff "diff" cset bin_echo "echo" cset bin_env "env" cset bin_file "file" cset bin_id "id" cset bin_grep "grep" cset bin_head "head" cset bin_ln "ln" cset bin_perldoc "/usr/bin/perldoc" # Don't user $bin_hostname unless on a front-end system that # doesn't fully use the slurmd, use $bin_printenv SLURMD_NODENAME cset bin_hostname "hostname" cset bin_kill "kill" cset bin_make "make" cset bin_mv "mv" cset bin_od "od" cset bin_pkill "pkill" cset bin_printenv "printenv" cset bin_ps "ps" cset bin_pwd "pwd" cset bin_rm "rm" cset bin_sed "sed" cset bin_sleep "sleep" cset bin_sort "sort" cset bin_sum "sum" cset bin_touch "touch" cset bin_true "true" cset bin_uname "uname" cset bin_uniq "uniq" cset bin_wc "wc" # # Let the commands complete without expect timing out waiting for a # response. Single node jobs submitted to the default partition should # be initiated within this number of seconds. # for interactive slurm jobs: cset timeout $max_job_delay # cset max_job_delay 120 # # Specify the maximum number of tasks to use in the stress tests. # cset max_stress_tasks 4 # # The error message that the "sleep" command prints when we run "sleep aaa". # cset sleep_error_message "(invalid time interval)|(bad character in argument)|(usage: sleep seconds)" # Force LANG, as the expect tests aren't localized set ::env(LANG) "en_US.UTF-8" # Testsuite level variables cset testsuite_shared_dir "[$bin_pwd]" # Testsuite log variables cset testsuite_log_level $LOG_LEVEL_DEBUG cset testsuite_log_format "\[%{timestamp}s.%{msecs}03d] %{loglevel}-7s %{message}s \(%{backtrace}s)" cset testsuite_time_format "%Y-%m-%d %H:%M:%S" # Default to using color if writing to a terminal and not if writing to a file cset testsuite_colorize [dict exists [fconfigure stdout] -mode] cset testsuite_color_fatal $COLOR_RED cset testsuite_color_error $COLOR_RED_NORMAL cset testsuite_color_warn $COLOR_ORANGE cset testsuite_color_info $COLOR_YELLOW cset testsuite_color_debug $COLOR_BLUE cset testsuite_color_trace $COLOR_MAGENTA cset testsuite_color_header $COLOR_NONE cset testsuite_color_success $COLOR_GREEN cset testsuite_color_failure $COLOR_RED cset testsuite_color_skipped $COLOR_ORANGE # Set to true to cause the first subtest failure to immediately end the test cset testsuite_subtest_fatal false # To automatically call cleanup or not when ending the test cset testsuite_cleanup_on_failure true if {[info exists env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE)]} { set testsuite_cleanup_on_failure $env(SLURM_TESTSUITE_CLEANUP_ON_FAILURE) } set subtest_pass_count 0 set subtest_skip_count 0 set subtest_fail_count 0 # Other common variables set re_word_str "\\S+" set digit "\\d" set eol "\r?\n" set float "\\d+\\.?\\d*" set number "\\d+" set format_time "\\d+\\:\\d+\\:\\d+" set number_with_suffix "\\d+\[KM\]*" set slash "/" set whitespace "\\s+" set controlmachine_regex "\\S+" # Any characters except ( , : newline set no_delim "\[^(,:\r\n\]" set no_delim_slash "\[^(,:/\r\n\]" # The first group matches GRES name # The second **optional** group matches GRES type. # The third group matches GRES count. # Test out the regex here: https://regex101.com/r/FlNYKM/7 set gres_regex "($no_delim_slash*):($no_delim*)?:?($no_delim*)" #basic #defines in slurm.h set NO_VAL 0xfffffffe set INFINITE 0xffffffff set SLURM_MAX_NORMAL_STEP_ID 0xfffffff0 set SLURM_EXTERN_CONT 0xfffffffc set SLURM_BATCH_SCRIPT 0xfffffffb # # Global variable used in multiple functions in "globals" file # set gpu_sock_list {} # # Procedure return values # set RETURN_SUCCESS 0 set RETURN_ERROR 1 set RETURN_TIMEOUT 2 ################################################################ # # NAME # fail - fails a test # # SYNOPSIS # fail message # # DESCRIPTION # To be used when an error is fatal for the test. This routine # prints the specified error message, optionally cleans up, prints # a final test failure message, and exits the test with exit code 1. # # ENVIRONMENT # Whether or not the cleanup procedure is called depends on the setting # of the $testsuite_cleanup_on_failure set in the globals.local file or # overridden with the SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment # variable. # # NOTE # DO NOT call this within your local cleanup procedure. # ################################################################ proc fail { message } { # Avoid recursive calls from within cleanup if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { log_error "Local cleanup shouldn't call pass, fail or skip" return } log_fatal $message # _test_fini will handle cleanup and print the failure message. _test_fini 1 } ################################################################ # # NAME # skip - skips a test # # SYNOPSIS # skip message # # DESCRIPTION # To be used when a precondition for the test fails and the test # should be skipped. This routine prints the specified warning message, # calls the cleanup procedure if defined, prints a final test skipped # message, and exits the test with exit code -1 (aka 255). # # NOTE # DO NOT call this within your local cleanup procedure. # ################################################################ proc skip { message } { # Avoid recursive calls from within cleanup if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { log_error "Local cleanup shouldn't call pass, fail or skip" return } log_warn $message # _test_fini will handle cleanup and print the skipped message. _test_fini -1 } ################################################################ # # NAME # pass - passes a test # # SYNOPSIS # pass # # DESCRIPTION # To be used when a test passes and should complete with success. # This routine calls the cleanup procedure if defined, prints a final # test success message, and exits with exit code 0. # # NOTE # DO NOT call this within your local cleanup procedure. # ################################################################ proc pass { } { # Avoid recursive calls from within cleanup if {[info level] > 1 && [lindex [info level -1] 0] eq "cleanup"} { log_error "Local cleanup shouldn't call pass, fail or skip" return } # _test_fini will handle cleanup and print the success message. _test_fini 0 } ################################################################ # # NAME # fail_on_error - fails if exit_code is non-zero # # SYNOPSIS # fail_on_error message # # DESCRIPTION # If exit_code global variable is non-zero it calls fail with the # message. # # NOTE # DO NOT call this within your local cleanup procedure. # ################################################################ proc fail_on_error { message } { global exit_code if {$exit_code != 0} { fail "$message" } } ################################################################ # # NAME # subpass - registers a passing subtest result # # SYNOPSIS # subpass ?description? # # DESCRIPTION # Increments the subtest pass count and logs a passing subtest message # # ARGUMENTS # description # A single-line string describing the subtest being verified # ################################################################ proc subpass args { global subtest_fail_count subtest_pass_count subtest_skip_count set description "" set argument_count [llength $args] if {$argument_count == 1} { set args [lassign $args description] } if {$argument_count > 1} { fail "Too many arguments ($argument_count): $args" } set subtest_count [expr $subtest_pass_count + $subtest_fail_count + $subtest_skip_count + 1] incr subtest_pass_count set message "Subtest $subtest_count passed" if {$description ne ""} { append message ": $description" } log_info $message } ################################################################ # # NAME # subfail - registers a failing subtest result # # SYNOPSIS # subfail ?options? ?description? ?diagnostics? # # DESCRIPTION # Increments the subtest failure count and logs a failing subtest message # # OPTIONS # -fatal # Causes this subtest failure to be fatal, ending the test # ARGUMENTS # description # A single-line string describing the condition being verified # diagnostics # A string providing additional diagnostic information that will # be included with the log message # # ENVIRONMENT # testsuite_subtest_fatal # Specifies whether first failing subtest aborts the test # ################################################################ proc subfail args { global subtest_fail_count subtest_pass_count subtest_skip_count global testsuite_subtest_fatal set description "" set fatal false while {[llength $args]} { switch -glob -- [lindex $args 0] { -fatal {set fatal true; set args [lrange $args 1 end]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count >= 1} { set args [lassign $args description] } set subtest_count [expr $subtest_pass_count + $subtest_fail_count + $subtest_skip_count + 1] incr subtest_fail_count set message "Subtest $subtest_count failed" if {$description ne ""} { append message ": $description" } if [llength $args] { append message " (" [join $args ", "] ")" } if {$fatal || $testsuite_subtest_fatal} { fail $message } else { log_error $message } } ################################################################ # # NAME # subskip - registers a skipped subtest result # # SYNOPSIS # subskip ?options? ?description? # # DESCRIPTION # Increments the subtest skip count and logs a skipped subtest message # # OPTIONS # -count NUMBER # When used with -skip, indicates the number of subtests that # were skipped # ARGUMENTS # description # A single-line string describing the reason the subtest is # being skipped # ################################################################ proc subskip args { global subtest_fail_count subtest_pass_count subtest_skip_count set description "" set count 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -count {set args [lassign $args - count]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count == 1} { set args [lassign $args description] } if {$argument_count > 1} { fail "Too many arguments ($argument_count): $args" } set subtest_count [expr $subtest_pass_count + $subtest_fail_count + $subtest_skip_count + 1] incr subtest_skip_count $count if {$count > 1} { set message "$count subtests skipped" } else { set message "Subtest $subtest_count skipped" } if {$description ne ""} { append message ": $description" } log_warn $message } ################################################################ # # NAME # print_time - prints the current date and time # # SYNOPSIS # print_time # ################################################################ proc print_time { } { global bin_date spawn $bin_date expect { eof { wait } } return } ################################################################ # # NAME # dict_getdef - 'dict get' with ability to specify the default value # # SYNOPSIS # dict_getdef dictionary_value key default_value # # DESCRIPTION # Tcl < 8.7 lacks a built in 'dict get' with ability to specify the # default value. Tcl 8.7 adds a dict getdef. # This proc returns the value from the dictionary corresponding to the # keys if it exists, or the default value otherwise. # # EXAMPLE # dict_getdef $option_dict action "warn" # # SOURCE # https://core.tcl-lang.org/tips/doc/trunk/tip/342.md # https://core.tcl-lang.org/tcl/tktview/2370575 # ################################################################ proc dict_getdef {D args} { if {[dict exists $D {*}[lrange $args 0 end-1]]} then { dict get $D {*}[lrange $args 0 end-1] } else { lindex $args end } } ################################################################ # # NAME # _line_trace - returns an abbreviated call stack trace with line numbers # # SYNOPSIS # _line_trace # ################################################################ proc _line_trace {} { set line_trace "" set first_entry true for {set f [expr [info frame] - 3]} {$f >= 1} {incr f -1} { set frame_dict [info frame $f] if [dict exists $frame_dict file] { if [regexp uplevel [dict get $frame_dict cmd]] { continue } if {$first_entry} { set first_entry false } else { append line_trace "," } if [dict exists $frame_dict proc] { set proc [namespace tail [dict get $frame_dict proc]] if {$proc ne ""} { append line_trace "$proc\@" } } append line_trace [file tail [dict get $frame_dict file]] if [dict exists $frame_dict line] { append line_trace ":[dict get $frame_dict line]" } } } return $line_trace } ################################################################ # # NAME # run_command - executes a command and returns a dictionary result # # SYNOPSIS # run_command ?options? command # # DESCRIPTION # Executes a command and returns a dictionary that includes the output, # exit code, etc. An action can be taken (fail, warn, none) if the # command's exit code or timeout is unexpected. By default, the action # will be applied if the command fails. If the -xfail option is # specified, the behavior will be reversed to apply the action if the # command ran successfully. # # OPTIONS # -fail # If the exit code or timeout is unexpected, the action that will # be taken is to fail the test # -warn # If the exit code or timeout is unexpected, the action that will # be taken is to log a warning (this is the default) # -none # If the exit code or timeout is unexpected, no action will be # taken # -xfail # If the command exits with zero and does not time out, the # action will be applied. Without this option, the action will # be applied if the command exits with a non-zero exit code or # times out. # -timeout # Time in seconds to wait for the command to complete before # timing out (default is 60.0) # -nolog # Logging for this command will occur at trace threshold only # # ARGUMENTS # command # a string containing the command and arguments to execute # # RETURN VALUE # A dictionary containing the following elements: # command - The command that was invoked # exit_code - Exit code # output - The combined standard output and standard error # start_time - The time (with ms) the command was executed # duration - The duration (seconds and milliseconds) the # command took to run # ################################################################ proc run_command args { global bin_bash set exit_status 0 set output "" set action "warn" set timeout 60 set expect_failure false set log_at_trace_level false while {[llength $args]} { switch -glob -- [lindex $args 0] { -fail {set action "fail"; set args [lrange $args 1 end]} -none {set action "none"; set args [lrange $args 1 end]} -timeout {set args [lassign $args - timeout]} -warn {set action "warn"; set args [lrange $args 1 end]} -xfail {set expect_failure true; set args [lrange $args 1 end]} -nolog {set log_at_trace_level true; set args [lrange $args 1 end]} -* {fail "Unknown option: [lindex $args 0]"} default break } } if {[llength $args] == 1} { lassign $args command } else { fail "Invalid number of arguments [llength $args]: $args" } if {$log_at_trace_level} { interp alias {} log_alias {} log_trace } else { interp alias {} log_alias {} log_debug } set orig_log_user [log_user -info] log_user 0 log_alias "Invoking command \"$command\"" set start_clock_ms [clock milliseconds] set stty_init raw ; # Prevent the terminal from inserting \r set expect_pid [spawn -noecho $bin_bash -c "$command"] expect { -re "(.+)" { append output $expect_out(1,string) exp_continue } timeout { set message "Command \"$command\" timed out after $timeout seconds" slow_kill $expect_pid set exit_status 1 } eof { lassign [wait] pid spawnid os_error_flag errno set exit_status [expr $errno > 128 ? $errno - 256 : $errno] } } set start_time [format "%.3f" [expr $start_clock_ms / 1000.000]] set start_hms [format "%s.%03d" [clock format [expr int($start_clock_ms / 1000.000)] -format %H:%M:%S] [expr int(fmod($start_clock_ms, 1000))]] set end_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] set duration [format "%.3f" [expr $end_time - $start_time]] log_alias "Command Results:" log_alias " Start Time: $start_hms" log_alias " Duration: $duration" log_alias " Exit Code: $exit_status" if {[info exists output]} { log_alias " Output: $output" } if {! $expect_failure && $exit_status != 0} { cset message "Command \"$command\" failed with rc=$exit_status" if {[info exists output] && $output != ""} { append message ": [string trimright $output]" } if {$action eq "warn"} { log_warn $message } elseif {$action eq "fail"} { fail $message } } elseif {$expect_failure && $exit_status == 0} { cset message "Command \"$command\" was expected to fail but succeeeded" if {$action eq "warn"} { log_warn $message } elseif {$action eq "fail"} { fail $message } } log_user $orig_log_user dict set result command $command dict set result exit_code $exit_status dict set result output $output dict set result start_time $start_time dict set result duration $duration return $result } ################################################################ # # NAME # run_command_output - executes a command and returns the output # # SYNOPSIS # run_command_output ?options? command # # DESCRIPTION # Executes a command and returns a dictionary that includes the output, # exit code, etc. An action can be taken (fail, warn, none) if the # command's exit code or timeout is unexpected. By default, the action # will be applied if the command fails. If the -xfail option is # specified, the behavior will be reversed to apply the action if the # command ran successfully. # # OPTIONS # -fail # if the exit code or timeout is unexpected, the action that will # be taken is to fail the test # -warn # if the exit code or timeout is unexpected, the action that will # be taken is to log a warning (this is the default) # -none # if the exit code or timeout is unexpected, no action will be # taken # -xfail # if the command exits with zero and does not time out, the # action will be applied. Without this option, the action will # be applied if the command exits with a non-zero exit code or # times out. # -timeout # time in seconds to wait for the command to complete before # timing out (default is 60.0) # # ARGUMENTS # command # a string containing the command and arguments to execute # # RETURN VALUE # A string containing the combined standard output and standard error # ################################################################ proc run_command_output args { global exit_code set result [run_command {*}$args] if [dict exists $result output] { return [dict get $result output] } else { return "" } } ################################################################ # # NAME # run_command_status - executes a command and returns the exit code # # SYNOPSIS # run_command_status ?options? command # # DESCRIPTION # Executes a command and returns a dictionary that includes the output, # exit code, etc. An action can be taken (fail, warn, none) if the # command's exit code or timeout is unexpected. By default, the action # will be applied if the command fails. If the -xfail option is # specified, the behavior will be reversed to apply the action if the # command ran successfully. # # OPTIONS # -fail # if the exit code or timeout is unexpected, the action that will # be taken is to fail the test # -warn # if the exit code or timeout is unexpected, the action that will # be taken is to log a warning (this is the default) # -none # if the exit code or timeout is unexpected, no action will be # taken # -xfail # if the command exits with zero and does not time out, the # action will be applied. Without this option, the action will # be applied if the command exits with a non-zero exit code or # times out. # -timeout # time in seconds to wait for the command to complete before # timing out (default is 60.0) # # ARGUMENTS # command # a string containing the command and arguments to execute # # RETURN VALUE # The exit code for the invoked command # ################################################################ proc run_command_status args { global exit_code set result [run_command {*}$args] return [dict get $result exit_code] } ################################################################ # # NAME # cancel_job - cancels the specified job list # # SYNOPSIS # cancel_job job_id_list ?het_job? # # ARGUMENTS # job_id_list # The list of Slurm job ids that we want to cancel # het_job # 1 if jobs are hetjobs and we want to confirm each # component has completed # # RETURN VALUE # RETURN_SUCCESS if jobs are cancelled, or non-zero value otherwise. # ################################################################ proc cancel_job { job_id_list {het_job 0}} { global scancel foreach job_id $job_id_list { if {$job_id == 0} { log_warn "Invalid job_id argument value ($job_id)" return $::RETURN_ERROR } log_debug "Cancelling $job_id" if {[run_command_status "$scancel -Q $job_id"]} { log_warn "scancel command returned error" return $::RETURN_ERROR } if {[wait_for_job $job_id "DONE" $het_job]} { log_warn "Job $job_id not ended" return $::RETURN_ERROR } } return $::RETURN_SUCCESS } ################################################################ # # NAME # get_line_cnt - returns the size of the specified file # # SYNOPSIS # get_line_cnt file_name # # RETURN VALUE # Number of lines in the specified file. # ################################################################ proc get_line_cnt { file_name } { global bin_wc number set lines 0 spawn $bin_wc -l $file_name expect { -re "($number) " { set lines $expect_out(1,string) exp_continue } eof { wait } } return $lines } ################################################################ # # NAME # slow_kill - kills a process slowly # # SYNOPSIS # slow_kill pid # # DESCRIPTION # Kill a process slowly, first trying SIGINT, pausing for # a second, then sending SIGKILL. # # RETURN VALUE # A non-zero return code indicates a failure. # ################################################################ proc slow_kill { pid } { global bin_kill catch {exec $bin_kill -INT $pid} catch {exec $bin_kill -INT $pid} sleep 1 catch {exec $bin_kill -KILL $pid} return 0 } ################################################################ # # NAME # get_my_id - gets the id from the running user # # SYNOPSIS # get_my_id # # RETURN VALUE # output of id # ################################################################ proc get_my_id {} { global bin_id number set login_info -1 log_user 0 spawn $bin_id expect { -re "(uid=.*\n)" { set login_info $expect_out(1,string) exp_continue } eof { wait } } log_user 1 if {$login_info == -1} { fail "Unable to get user info" } return $login_info } ################################################################ # # NAME # get_my_user_name - gets the name from the running user # # SYNOPSIS # get_my_user_name # # RETURN VALUE # A non-zero return code indicates a failure. # ################################################################ proc get_my_user_name { } { global bin_id re_word_str set user_name -1 log_user 0 spawn $bin_id -nu expect { -re "($re_word_str)" { set user_name $expect_out(1,string) exp_continue } eof { wait } } log_user 1 if {$user_name == -1} { fail "Unable to get user name" } return $user_name } ################################################################ # # NAME # get_my_uid - gets the uid from the running user # # SYNOPSIS # get_my_uid # # RETURN VALUE # A non-zero return code indicates a failure. # ################################################################ proc get_my_uid { } { global bin_id number set uid -1 log_user 0 spawn $bin_id -u expect { -re "($number)" { set uid $expect_out(1,string) exp_continue } eof { wait } } log_user 1 return $uid } ################################################################ # # NAME # get_my_gid - gets the gid from the running user # # SYNOPSIS # get_my_gid # # RETURN VALUE # A non-zero return code indicates a failure. # ################################################################ proc get_my_gid { } { global bin_id number set gid -1 log_user 0 spawn $bin_id -g expect { -re "($number)" { set gid $expect_out(1,string) exp_continue } eof { wait } } log_user 1 return $gid } ################################################################ # # NAME # kill_salloc - kills all salloc commands associated with this user # # SYNOPSIS # kill_salloc # # DESCRIPTION # Kill all salloc commands associated with this user. # Issue two SIGINT, sleep 1 and a SIGKILL # # RETURN VALUE # A non-zero return code indicates a failure. # # NOTE # Use slow_kill instead of kill_salloc if you can capture # the process id # ################################################################ proc kill_salloc { } { global bin_id bin_pkill bin_sleep number set uid [get_my_uid] catch {exec $bin_pkill -INT -u $uid salloc} catch {exec $bin_pkill -INT -u $uid salloc} sleep 1 catch {exec $bin_pkill -KILL -u $uid salloc} return 0 } ################################################################ # # NAME # kill_srun - kills all srun commands associated with this user # # SYNOPSIS # kill_srun # # DESCRIPTION # Kill all srun commands associated with this user. # Issue two SIGINT, sleep 1 and a SIGKILL # # RETURN VALUE # A non-zero return code indicates a failure. # # NOTE # Use slow_kill instead of kill_srun if you can capture # the process id # ################################################################ proc kill_srun { } { global bin_id bin_pkill bin_sleep number set uid [get_my_uid] catch {exec $bin_pkill -INT -u $uid srun} catch {exec $bin_pkill -INT -u $uid srun} sleep 1 catch {exec $bin_pkill -KILL -u $uid srun} return 0 } ################################################################ # # NAME # wait_for - generic wait utility # # SYNOPSIS # wait_for ?options? condition body # # DESCRIPTION # Generic wait utility allowing you to repeatedly execute a generic block # of code until a specified boolean expression is met. The code block and # condition check occur every poll interval until a timeout is reached. # # OPTIONS # -fail # abort the test with failure if the condition is not met # -timeout # time in seconds to wait for the command to complete before # timing out (default is 60.0) # -pollinterval # time in seconds between each loop execution and condition check # (default is 1.0) # # ARGUMENTS # condition # The boolean expression to test # body # A block of code to evaluate in the invoking stack frame # # RETURN VALUE # RETURN_SUCCESS if the condition is met before the timeout occurs, # RETURN_TIMEOUT if the timeout occurs before the condition is met # ################################################################ proc wait_for args { set fatal false set timeout 60 set poll_interval 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -fail {set fatal true; set args [lrange $args 1 end]} -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } if {[llength $args] == 2} { lassign $args condition body } else { fail "Invalid number of arguments [llength $args]: $args" } set start_time [format "%.3f" [expr [clock milliseconds] / 1000.000]] log_debug "Waiting for $condition starting at [clock format [expr int($start_time)] -format %Y-%m-%dT%X].[lindex [split $start_time '.'] 1]" while {1} { # Evaluate code block log_trace "Evaluating code block ([string trim $body])" set ret [catch {uplevel $body} result] # Check condition if {[uplevel expr [format "{%s}" $condition]]} { set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] log_debug "Condition ($condition) was met" return $::RETURN_SUCCESS } else { log_trace "Condition ($condition) was not met" } # Sleep poll interval log_trace "Sleeping for $poll_interval seconds" after [expr {int($poll_interval * 1000)}] # Check if we have surpassed our timeout set now [format "%.3f" [expr [clock milliseconds] / 1000.000]] log_trace "Checking whether the current time ([clock format [expr int($now)] -format %Y-%m-%dT%X].[lindex [split $now '.'] 1]) is greater than the start time plus the timeout ([clock format [expr int($start_time + $timeout)] -format %Y-%m-%dT%X].[lindex [split [expr $start_time + $timeout] '.'] 1])" if {$now > $start_time + $timeout} { set message "Condition ($condition) did not occur before timeout ($timeout) seconds" if {$fatal} { fail $message } else { log_warn $message return $::RETURN_TIMEOUT } } } } ################################################################ # # NAME # wait_for_command - waits for command output to match a pattern # # SYNOPSIS # wait_for_command ?options? command args regex ?matches_in? ?or_more? ?matches_out? # # DESCRIPTION # Executes a command every poll interval until a regex # pattern is matched in the output, or until timeout. # # OPTIONS # -timeout # time in seconds to wait for the command to complete before # timing out (default is 120) # -pollinterval # time in seconds between each loop execution and condition check # (default is 1) # # ARGUMENTS # command # The command to run via spawn. # args # The arguments to the command, as a single string. # regex # The regex pattern to search for in the command # output. Can be a simple string. # matches_in # The number of times to match the regex. Defaults to 1. # or_more # If 1, allow for matching the regex pattern match_cnt *or more* # times, instead of exactly match_cnt times. Defaults to 0. # matches_out # The upvar (a variable name to "pass by reference" in TCL) # to set/return the number of matches found. # Useful if or_more is 1 and the caller wants the matches found. # # RETURN VALUE # RETURN_SUCCESS on success and RETURN_TIMEOUT on timeout. # On failure, an error is logged to the output. # If matches_out is specified, the number of matches will # be returned via the reference/upvar matches_out. # ################################################################ proc wait_for_command args { global bin_sleep set timeout 120 set poll_interval 1 set matches_in 1 set or_more 0 set matches_out "" while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count < 3} { fail "Too few arguments ($argument_count): $args" } else { lassign $args command arguments regex } if {$argument_count >= 4} { set matches_in [lindex $args 3] } if {$argument_count >= 5} { set or_more [lindex $args 4] } if {$argument_count == 6} { set matches_out [lindex $args 5] } if {$argument_count > 6} { fail "Too many arguments ($argument_count): $args" } if {$matches_out != ""} { upvar $matches_out matches } set delay 0 while {$delay < $timeout} { set matches 0 # `{*}` breaks apart a string into individual pieces spawn $command {*}$arguments expect { -re $regex { incr matches exp_continue } timeout { log_error "$command not responding after $delay seconds polling" return $::RETURN_TIMEOUT } eof { wait } } if {($matches == $matches_in) || ($or_more == 1 && $matches >= $matches_in)} { return $::RETURN_SUCCESS } log_debug "Polled $matches matches of '$regex', but expecting $matches_in" exec $bin_sleep $poll_interval incr delay $poll_interval } if {$or_more == 1} { set match_str "$matches_in or more times" } elseif {$matches_in == 1} { set match_str "exactly $matches_in time" } else { set match_str "exactly $matches_in times" } log_error "Failed to match regex `$regex` $match_str after $timeout seconds for command `$command $arguments`." return $::RETURN_TIMEOUT } ################################################################ # # NAME # wait_for_file - waits for a file to exist with non-zero size # # SYNOPSIS # wait_for_file ?options? file_name # # OPTIONS # -timeout # time in seconds to wait for the file to exist before # timing out (default is 90) # -pollinterval # time in seconds between each file existence test (default is 1) # # DESCRIPTION # Wait for the specified file to exist and have a non-zero size. # Note that if JobFileAppend=0 is configured, a file can exist and # be purged then be re-created. # # RETURN VALUE # RETURN_SUCCESS if job reaches the desired state, or non-zero value # otherwise. # ################################################################ proc wait_for_file args { global bin_sleep set timeout 90 set poll_interval 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count != 1} { fail "Invalid number of arguments ($argument_count): $args" } else { lassign $args file_name } for {set my_delay 0} {$my_delay <= $timeout} \ {set my_delay [expr $my_delay + $poll_interval]} { if {[file exists $file_name]} { # Add small delay for I/O buffering exec $bin_sleep 1 return $::RETURN_SUCCESS } exec $bin_sleep $poll_interval # Expect may fail to load current NFS info. # Use the ls command to load current info. set slash_pos [string last $file_name "/"] if {$slash_pos < 1} { set dir_name "." } else { decr slash_pos set dir_name [string $file_name 0 $slash_pos] } exec /bin/ls $dir_name } log_error "Timeout waiting for file $file_name" return $::RETURN_TIMEOUT } ################################################################ # # NAME # wait_for_job - waits for job to be in desired state # # SYNOPSIS # wait_for_job ?options? job_id desired_state ?het_job? # # DESCRIPTION # Wait for job to be in desired state. Can handle het job components. # # OPTIONS # -timeout # time in seconds to wait for the job to be in the desired state # before timing out (default is 90) # -pollinterval # time in seconds between each job state check (default is 1) # # ARGUMENTS # job_id # The Slurm job id of a job we want to wait for. # desired_state # The state you want the job to attain before returning. # Currently supports: # DONE any terminated state # PENDING job is pending # RUNNING job is running # SPECIAL_EXIT # SUSPENDED job is suspended # het_job # If set, checks the state of each component job if the job # is a het one. # # RETURN VALUE # RETURN_SUCCESS if job reaches the desired state, or non-zero value # otherwise. # # SEE ALSO # _wait_for_single_job # ################################################################ proc wait_for_job args { set options [list] set het_job 0 while {[llength $args]} { switch -glob -- [lindex $args 0] { -* { lappend options {*}[lrange $args 0 1] set args [lrange $args 2 end] } default break } } set argument_count [llength $args] if {$argument_count < 2} { fail "Too few arguments ($argument_count): $args" } elseif {$argument_count > 3} { fail "Too many arguments ($argument_count): $args" } else { lassign $args job_id desired_state } if {$argument_count == 3} { set hetjob [lindex $args 2] } if { $het_job } { # get component job ids set jid_list [get_het_job_ids $job_id 1] } set rc 0 set jid_list "" if { $jid_list == "" } { # non-het job set jid_list $job_id } foreach jid $jid_list { set rc [_wait_for_single_job {*}$options $jid $desired_state] if { $rc } { # bail out on first failure break } } return $rc } ################################################################ # # NAME # wait_for_account_done - cancels and waits on jobs in specified accounts # # SYNOPSIS # wait_for_account_done ?options? accounts # # DESCRIPTION # Cancel jobs on and wait for them to be finished in account(s) given. # # OPTIONS # -timeout # time in seconds to wait for the jobs to be finished before # timing out (default is 360) # -pollinterval # time in seconds between each job state check (default is 1) # # ARGUMENTS # accounts # Comma-delimited list of accounts # # RETURN VALUE # RETURN_SUCCESS if all jobs of the account are finished, or non-zero # otherwise. # # NOTE # We sleep for two seconds before replying that a job is # done to give time for I/O completion (stdout/stderr files) # ################################################################ proc wait_for_account_done args { global scancel squeue re_word_str set timeout 360 set poll_interval 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count != 1} { fail "Invalid number of arguments ($argument_count): $args" } else { lassign $args accounts } if { $accounts == "" } { log_error "No account given" return $::RETURN_ERROR } log_user 0 set account_list [split $accounts ","] foreach item $account_list { spawn $scancel -A $item expect { timeout { log_warn "No response from scancel" } eof { wait } } } set my_delay 0 while 1 { set found 0 spawn $squeue -o Account=%a -h -A$accounts expect { -re "Account=($re_word_str)" { set found 1 exp_continue } eof { wait } } if { !$found } { log_debug "Account(s) $accounts is/are empty" break } if { $my_delay > $timeout } { log_error "Timeout waiting for account(s) '$accounts' to be finished" log_user 1 return $::RETURN_TIMEOUT } exec sleep $poll_interval set my_delay [expr $my_delay + $poll_interval] } log_user 1 return $::RETURN_SUCCESS } ################################################################ # # NAME # wait_for_part_done - cancels and waits on jobs in specified partition # # SYNOPSIS # wait_for_part_done ?options? partition # # DESCRIPTION # Cancel jobs on and wait for them to be finished in partition given. # # OPTIONS # -timeout # time in seconds to wait for the jobs to be finished before # timing out (default is 360) # -pollinterval # time in seconds between each job state check (default is 1) # # ARGUMENTS # partition # partition name # # RETURN VALUE # RETURN_SUCCESS if all jobs of the partition are finished, or non-zero # otherwise. # # NOTE # We sleep for two seconds before replying that a job is # done to give time for I/O completion (stdout/stderr files) # ################################################################ proc wait_for_part_done args { global scancel squeue re_word_str set timeout 360 set poll_interval 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count != 1} { fail "Invalid number of arguments ($argument_count): $args" } else { lassign $args partition } if { $partition == "" } { log_error "No partition given" return $::RETURN_ERROR } log_user 0 spawn $scancel -p $partition expect { timeout { log_error "No response from scancel" } eof { wait } } set my_delay 0 while 1 { set found 0 spawn $squeue -o Part=%P -h -p$partition expect { -re "Part=($re_word_str)" { set found 1 exp_continue } eof { wait } } if { !$found } { log_debug "Partition $partition is empty" break } if { $my_delay > $timeout } { log_error "Timeout waiting for partition '$partition' to be finished" log_user 1 return $::RETURN_TIMEOUT } exec sleep $poll_interval set my_delay [expr $my_delay + $poll_interval] } log_user 1 return $::RETURN_SUCCESS } ################################################################ # # NAME # wait_for_step - waits for a job step to be found # # SYNOPSIS # wait_for_step ?options? step_id # # DESCRIPTION # Wait for a job step to be found. # # OPTIONS # -timeout # time in seconds to wait for the job step to be found before # timing out (default is 360) # -pollinterval # time in seconds between each step existence check (default is 1) # # ARGUMENTS # step_id # job step id # # RETURN VALUE # RETURN_SUCCESS if step_id is found, or non-zero otherwise. # ################################################################ proc wait_for_step args { global scontrol set timeout 360 set poll_interval 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count != 1} { fail "Invalid number of arguments ($argument_count): $args" } else { lassign $args step_id } set my_delay 0 while 1 { set fd [open "|$scontrol -o show step $step_id"] gets $fd line catch {close $fd} if {[regexp {Nodes=} $line foo] == 1} { return $::RETURN_SUCCESS } if {[regexp {MidplaneList=} $line foo] == 1} { return $::RETURN_SUCCESS } if { $my_delay > $timeout } { log_error "Timeout waiting for job step" return $::RETURN_TIMEOUT } log_debug "Step $step_id not done yet. Waiting for $poll_interval seconds" exec sleep $poll_interval set my_delay [expr $my_delay + $poll_interval] } } ################################################################ # # NAME # wait_for_all_jobs - waits for jobs to finish having a specified name # # SYNOPSIS # wait_for_all_jobs ?options? job_name # # DESCRIPTION # Wait for previously submitted Slurm jobs to finish of a certain name. # # OPTIONS # -timeout # time in seconds to wait for the jobs to finish before # timing out (default is 30) # -pollinterval # time in seconds between each job state check (default is 1) # # ARGUMENTS # job_name # The name of job to wait for # # RETURN VALUE # RETURN_SUCCESS if all jobs with the specified name are finished, # or non-zero otherwise. # If jobs are not completed after timeout, they are cancelled. # ################################################################ proc wait_for_all_jobs args { global scancel squeue set timeout 30 set poll_interval 1 while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count != 1} { fail "Invalid number of arguments ($argument_count): $args" } else { lassign $args job_name } log_debug "Waiting for all jobs to terminate" set jobs_found -1 set jobs_desired 0 wait_for -timeout $timeout -pollinterval $poll_interval {$jobs_found == $jobs_desired} { set jobs_found 0 log_user 0 spawn $squeue -o %j -n $job_name expect { -re "$job_name" { incr jobs_found exp_continue } timeout { fail "No response from squeue" } eof { wait } } log_user 1 if {$jobs_found != $jobs_desired} { log_debug "Still $jobs_found jobs remaining" } } if {$jobs_found == $jobs_desired} { log_debug "All jobs complete" return $::RETURN_SUCCESS } else { log_debug "Cancelling uncompleted jobs" run_command "$scancel -n $job_name" return $::RETURN_ERROR } } ################################################################ # # NAME # wait_job_reason - waits for a desired job state and reason # # SYNOPSIS # wait_job_reason ?options? job_id ?desired_state? ?desired_reason_list? # # DESCRIPTION # Wait until the job is in desired state and reason is one # of the desired ones or until the timeout. # # OPTIONS # -timeout # time in seconds to wait for the job state and reason before # timing out (default is 360) # -pollinterval # time in seconds between each job state check (default is 1) # # ARGUMENTS # job_id # The job to wait for # desired_state # Desired state. # desired_reason_list # List of desired reasons. Empty list means that any reason # is ok. # # RETURN VALUE # RETURN_SUCCESS when job is in the desired state and reason is one # of the desired ones, or non-zero otherwise. # ################################################################ proc wait_job_reason args { global scontrol re_word_str set final_state "COMPLETED CANCELLED FAILED TIMEOUT DEADLINE OUT_OF_MEMORY" set timeout 360 set poll_interval 1 set desired_state "PENDING" set desired_reason_list "" while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count < 1} { fail "Too few arguments ($argument_count): $args" } else { lassign $args job_id } if {$argument_count >= 2} { set desired_state [lindex $args 1] } if {$argument_count == 3} { set desired_reason_list [lindex $args 2] } if {$argument_count > 3} { fail "Too many arguments ($argument_count): $args" } set log_user_prev [log_user -info] log_user 0 set my_delay 0 set rc $::RETURN_ERROR while true { set pending 0 set has_reason 1 spawn $scontrol show job $job_id expect { -re "JobState=($re_word_str) Reason=(\\S+)" { set job_state $expect_out(1,string) set job_reason $expect_out(2,string) } timeout { log_error "No response from scontrol show job" set rc $::RETURN_TIMEOUT break } } # Check if both state and reason are the desired ones if {$job_state == $desired_state} { set found 0 if {$desired_reason_list == ""} { set found 1 } foreach desired_reason $desired_reason_list { if {$job_reason == $desired_reason } { set found 1 } } if {$found} { set rc $::RETURN_SUCCESS break } } elseif {[lsearch -exact final_state $job_state] >= 0} { # Job is in final step no need to wait longer log_error [format "Job in final state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \ $job_state $job_reason \ $desired_state $desired_reason_list] set rc $::RETURN_ERROR break } # Check if this was the last poll if {$my_delay > $timeout} { log_error "Timeout" set rc $::RETURN_TIMEOUT break } set remamining_sec [expr $timeout - $my_delay] log_debug [format "Job in state/reason '%s' / '%s' instead of the desired '%s' / '%s'." \ $job_state $job_reason \ $desired_state $desired_reason_list] log_debug [format "Polling again in %ss, %ss to timeout." \ $poll_interval $remamining_sec] sleep $poll_interval set my_delay [expr $my_delay + $poll_interval] } log_user $log_user_prev return $rc } ################################################################ # # NAME # get_config - returns a dictionary of slurm configuration parameters # # SYNOPSIS # get_config # # DESCRIPTION # Uses `scontrol show config` to return the slurm configuration as # a dictionary of parameter values. # ################################################################ proc get_config { } { global scontrol set output [run_command_output -fail -nolog "$scontrol show config"] foreach line [split $output "\n"] { if {[regexp {^(\w+) += (.*)$} $line {} param_name param_value] == 1} { dict set config_dict $param_name $param_value } } return $config_dict } ################################################################ # # NAME # get_config_param - returns a slurm configuration parameter # # SYNOPSIS # get_config_param parameter_name # # DESCRIPTION # Returns a specific configuration parameter value. # # RETURN VALUE # Returns the value of the specified parameter or MISSING if it does not # exist. # ################################################################ proc get_config_param { parameter_name } { set config_dict [get_config] if [dict exists $config_dict $parameter_name] { return [dict get $config_dict $parameter_name] } else { return "MISSING" } } ################################################################ # # NAME # param_contains - test whether a comma-separated-list contains a specified value # # SYNOPSIS # param_contains haystack needle # # DESCRIPTION # Searches for the specified value (needle) in the comma-separated-list # string (haystack). Needle can be a glob-style pattern. # # RETURN VALUE # Returns a boolean value indicating whether the value (needle) was found # in the comma-separated-list string (haystack) # ################################################################ proc param_contains { haystack needle } { if {[lsearch [split $haystack ","] $needle] != -1} { return true } else { return false } } ################################################################ # # NAME # get_affinity_types - gets the task plugins running with task/ stripped # # SYNOPSIS # get_affinity_types # # RETURN VALUE # Returns comma separated list of task plugins running without the task/ # ################################################################ proc get_affinity_types { } { global scontrol re_word_str log_user 0 set affinity "" spawn $scontrol show config expect { -re "TaskPlugin *= ($re_word_str)" { set parts [split $expect_out(1,string) ",/"] while 1 { set task_found [lsearch $parts "task"] if { $task_found == -1 } break set parts [lreplace $parts $task_found $task_found] } set affinity [join $parts ","] exp_continue } eof { wait } } log_user 1 return $affinity } ################################################################ # # NAME # get_mps_count_by_index - gets the count of a specific gres/mps device # # SYNOPSIS # get_mps_count_by_index index hostname # # RETURN VALUE # Returns the Count of a specific gres/mps device # ################################################################ proc get_mps_count_by_index { index hostname } { global slurmd number re_word_str log_user 0 set count 0 spawn $slurmd -G -N $hostname expect { -re "Gres Name=mps Type=$re_word_str Count=($number) Index=$index" { set count $expect_out(1,string) exp_continue } eof { wait } } log_user 1 return $count } ################################################################ # # NAME # get_bb_types - gets the burst buffer plugins running with task/ stripped # # SYNOPSIS # get_bb_types # # RETURN VALUE # Returns comma separated list of task plugins running without the task/ # ################################################################ proc get_bb_types { } { global scontrol re_word_str log_user 0 set bb_types "" spawn $scontrol show config expect { -re "BurstBufferType *= ($re_word_str)" { set parts [split $expect_out(1,string) ",/"] while 1 { set task_found [lsearch $parts "burst_buffer"] if { $task_found == -1 } break set parts [lreplace $parts $task_found $task_found] } set bb_types [join $parts ","] exp_continue } eof { wait } } log_user 1 return $bb_types } ################################################################ # # NAME # priority_type - gets the priority plugin type # # SYNOPSIS # priority_type # # DESCRIPTION # Use scontrol to determine the priority plugin # # RETURN VALUE # Name of priority type # ################################################################ proc priority_type {} { global scontrol log_user 0 set name "" set fd [open "|$scontrol show config"] while {[gets $fd line] != -1} { if {[regexp {^PriorityType *= priority/(\w+)} $line frag name] == 1} { break } } catch {close $fd} log_user 1 if {[string length $name] == 0} { log_error "Could not identify the Priority Type" } return $name } ################################################################ # # NAME # get_default_acct - gets user's default account # # SYNOPSIS # get_default_acct user # # RETURN VALUE # Returns name of default account if exists, NULL otherwise # ################################################################ proc get_default_acct { user } { global sacctmgr re_word_str bin_id log_user 0 set def_acct "" if { !$user } { set user [get_my_user_name] } spawn $sacctmgr -n list -P user $user format="DefaultAccount" expect { -re "($re_word_str)" { set def_acct $expect_out(1,string) exp_continue } eof { wait } } log_user 1 return $def_acct } ################################################################ # # NAME # get_cycle_count - get desired iteration count # # SYNOPSIS # get_cycle_count # # DESCRIPTION # For tests with iteration counts (e.g. test9.1, test9.2) # return the desired iteration count # # RETURN VALUE # Returns desired iteration count # ################################################################ proc get_cycle_count { } { global enable_memory_leak_debug if {$enable_memory_leak_debug != 0} { return 2 } return 100 } ################################################################ # # NAME # get_select_type_params - determines SelectTypeParameters being used for a given partition # # SYNOPSIS # get_select_type_params ?partition? # # DESCRIPTION # Determine SelectTypeParameters being used for a given partition. # If the partition is not specified, the default partition will be used. # # RETURN VALUE # Returns a string containing SelectTypeParameters # ################################################################ proc get_select_type_params { {partition ""} } { global scontrol bin_bash bin_grep re_word_str log_user 0 set params "" if {[string length $partition] == 0} { set partition [default_partition] } if {[string compare $partition ""]} { spawn -noecho $bin_bash -c "exec $scontrol show part $partition | $bin_grep SelectTypeParameters" expect { -re "SelectTypeParameters *= *NONE" { exp_continue } -re "SelectTypeParameters *= *($re_word_str)" { set params $expect_out(1,string) exp_continue } eof { wait } } } if { [string compare params ""] } { spawn -noecho $bin_bash -c "exec $scontrol show config | $bin_grep SelectTypeParameters" expect { -re "SelectTypeParameters *= *($re_word_str)" { set params $expect_out(1,string) exp_continue } eof { wait } } } log_user 1 return $params } ################################################################ # # NAME # check_config_select - checks if effectively using the select type # # SYNOPSIS # check_config_select type # # DESCRIPTION # Determine if SelectType is equivalent to the passed one by also # checking other_cons_res and other_cons_tres on SelectTypeParameters # in case that select/cray_aries is configured. # # ARGUMENTS # type # the desired SelectType to check (e.g. cons_tres) # # RETURN VALUE # Returns true if configured, false otherwise # ################################################################ proc check_config_select { type } { set select_type [get_config_param "SelectType"] set select_type_parameters [get_config_param "SelectTypeParameters"] if {$select_type eq "select/$type"} { return true } if {$select_type eq "select/cray_aries"} { if {$type eq "linear" && ![param_contains $select_type_parameters "other_cons_res"] && ![param_contains $select_type_parameters "other_cons_tres"]} { return true } if {$type eq "cons_res" && [param_contains $select_type_parameters "other_cons_res"]} { return true } if {$type eq "cons_tres" && [param_contains $select_type_parameters "other_cons_tres"]} { return true } } return false } ################################################################ # # NAME # get_total_cpus - gets the total amount of CPUs on the default partition # # SYNOPSIS # get_total_cpus # # RETURN VALUE # The total amount of CPUs on the default partition. # # NOTE # CoreSpecCount are not part of the total. # ################################################################ proc get_total_cpus {} { global sinfo scontrol re_word_str exit_code set partition [default_partition] set cpu_cnt 0 set re_with_cs "CPUTot=(\\d+).*CoreSpecCount=(\\d+).*ThreadsPerCore=(\\d+)" set re_without_cs "CPUTot=(\\d+)" spawn $sinfo -h -o "%P %N" -p $partition --state=idle expect { -re "$partition\\* ($re_word_str)" { set def_hostlist $expect_out(1,string) exp_continue } timeout { log_error "sinfo not responding" set exit_code 1 } eof { wait } } set fd [open "|$scontrol --oneliner show node $def_hostlist"] while {[gets $fd line] != -1} { if {[regexp $re_with_cs $line frag tmp_cpu_cnt core_spec_cnt threads_per_core] == 1} { set cpu_cnt [expr $cpu_cnt + $tmp_cpu_cnt - $core_spec_cnt * $threads_per_core] continue } if {[regexp $re_without_cs $line frag tmp_cpu_cnt] == 1} { set cpu_cnt [expr $cpu_cnt + $tmp_cpu_cnt] continue } } return $cpu_cnt } ################################################################ # # NAME # is_super_user - determines if user is root or SlurmUser # # SYNOPSIS # is_super_user # # DESCRIPTION # Determine if user is a Slurm super user (i.e. user # root or configured SlurmUser) # # RETURN VALUE # true is user is root or SlurmUser, false otherwise # ################################################################ proc is_super_user { } { global number set user [get_my_user_name] # Check if user is root if {[string compare $user "root"] == 0} { return true } # Check if user is SlurmUser set slurm_user [get_config_param "SlurmUser"] if {[regexp "${user}\\($number\\)" $slurm_user match]} { return true } return false } ################################################################ # # NAME # dec2hex - creates a 32 bit hex number from a signed decimal number # # SYNOPSIS # dec2hex value # # DESCRIPTION # Create a 32 bit hex number from a signed decimal number # # RETURN VALUE # 32 bit hex version of input 'value' # # SOURCE # Courtesy of Chris Cornish # http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 # ################################################################ # Replace all non-decimal characters proc dec2hex {value} { regsub -all {[^0-x\.-]} $value {} newtemp set value [string trim $newtemp] if {$value < 2147483647 && $value > -2147483648} { set tempvalue [format "%#010X" [expr $value]] return [string range $tempvalue 2 9] } elseif {$value < -2147483647} { return "80000000" } else { return "7FFFFFFF" } } ################################################################ # # NAME # uint2hex - creates a 32 bit hex number from an unsigned decimal # # SYNOPSIS # uint2hex value # # DESCRIPTION # Create a 32 bit hex number from an unsigned decimal number. # # ARGUMENTS # value # unsigneddecimal number to convert # # RETURN VALUE # 32 bit hex version of input 'value' # # SOURCE # Courtesy of Chris Cornish # http://aspn.activestate.com/ASPN/Cookbook/Tcl/Recipe/415982 # ################################################################ # Replace all non-decimal characters proc uint2hex {value} { regsub -all {[^0-x\.-]} $value {} newtemp set value [string trim $newtemp] if {$value <= 4294967295 && $value >= 0} { set tempvalue [format "%#010X" [expr $value]] return [string range $tempvalue 2 9] } else { return "FFFFFFFF" } } ################################################################ # # NAME # available_nodes - returns number of available nodes # # SYNOPSIS # available_nodes ?state? ?partition? # # DESCRIPTION # Check to see if a given partition has a at least "num_nodes" number # of nodes in the alloc, idle, or comp state. This can be used to # avoid launching a job that will never run because nodes are in the # "drained" state or otherwise unavailable. # If the partition is not specified, the default partition will be used. # # RETURN VALUE # Returns the number of available nodes in the partition, or # -1 on failure. # ################################################################ proc available_nodes { {state ""} {partition ""} } { global sinfo if {[string length $partition] == 0} { set partition [default_partition] } if {[string length $state] == 0} { set state "idle,alloc,comp" } set available -1 set fd [open "|$sinfo --noheader --partition $partition --state $state --format %D"] gets $fd line catch {close $fd} regexp {\d+} $line available if {[string match *K $line]} { set available [expr $available * 1024] } elseif {[string match *M $line]} { set available [expr $available * 1048576] } return $available } ################################################################ # # NAME # partition_oversubscribe - determines the oversubscribe configuration of the specified partition # # SYNOPSIS # partition_oversubscribe ?partition? # # DESCRIPTION # Determine the oversubscribe configuration of the specified partition. # If the partition is not specified, the default partition will be used. # # RETURN VALUE # Return the oversubscribe configuration of the specified partition. # ################################################################ proc partition_oversubscribe { {partition ""} } { global sinfo if {[string length $partition] == 0} { set partition [default_partition] } set oversubscribe "NO" log_debug "$sinfo --noheader --partition $partition --format %h" set fd [open "|$sinfo --noheader --partition $partition --format %h"] gets $fd line catch {close $fd} regexp {[a-zA-Z]+} $line oversubscribe return $oversubscribe } ################################################################ # # NAME # default_partition - determines the name of the default partition # # SYNOPSIS # default_partition # # DESCRIPTION # Use scontrol to determine the name of the default partition # # RETURN VALUE # Name of the current default partition # ################################################################ proc default_partition {} { global scontrol set name "" set fd [open "|$scontrol --all --oneliner show partition"] while {[gets $fd line] != -1} { if {[regexp {^PartitionName=([^ ]*).*Default=YES} $line frag name] == 1} { break } } catch {close $fd} if {[string length $name] == 0} { log_error "Could not identify the default partition" } return $name } ################################################################ # # NAME # default_part_exclusive - determines if the default partition allocates whole nodes to jobs # # SYNOPSIS # default_part_exclusive # # DESCRIPTION # Use scontrol to determine if the default partition # allocates whole nodes to jobs # # RETURN VALUE # Name of the current default partition # ################################################################ proc default_part_exclusive {} { set def_part [default_partition] set oversubscribe [partition_oversubscribe $def_part] if {[string compare $oversubscribe "EXCLUSIVE"] == 0} { return 1 } else { return 0 } } ################################################################ # # NAME # switch_type - determines the switch type # # SYNOPSIS # switch_type # # DESCRIPTION # Use scontrol to determine the switch type # # RETURN VALUE # Name of SwitchType # ################################################################ proc switch_type {} { global scontrol set name "" set fd [open "|$scontrol show config"] while {[gets $fd line] != -1} { if {[regexp {^SwitchType *= switch/(\w+)} $line frag name] == 1} { break } } catch {close $fd} if {[string length $name] == 0} { log_error "Could not identify the switch type" } return $name } ################################################################ # # NAME # make_bash_script - creates a bash script # # SYNOPSIS # make_bash_script script_name script_contents # # DESCRIPTION # Create a bash script of name "script_name", and # make the body of the script "script_contents". # make_bash_script removes the file if it already exists, # then generates the #! line, and then dumps "script_contents" # to the file. Finally, it makes certain that the script # is executable. # # ARGUMENTS # script_name # file name for the bash script # script_contents # body of the script, not including the initial #! line. # # RETURN VALUE # Nothing. # ################################################################ proc make_bash_script { script_name script_contents } { global bin_bash bin_chmod file delete $script_name set fd [open $script_name "w"] puts $fd "#!$bin_bash" puts $fd $script_contents close $fd exec $bin_chmod 700 $script_name } ################################################################ # # NAME # get_suffix - given a hostname, returns it's numeric suffix # # SYNOPSIS # get_suffix hostname # # DESCRIPTION # Given a hostname, return it's numeric suffix # # RETURN VALUE # numerical suffix for input 'hostname' or -1 if not a number # ################################################################ proc get_suffix { hostname } { set host_len [string length $hostname] set host_inx [expr $host_len-1] set host_char [string index $hostname $host_inx] if {[string compare $host_char "0"] < 0 || [string compare $host_char "9"] > 0} { return -1 } for {set host_inx [expr $host_len-1]} {$host_inx >= 0} {incr host_inx -1} { set host_char [string index $hostname $host_inx] if {[string compare $host_char "0"] < 0} { break } if {[string compare $host_char "9"] > 0} { break } } incr host_inx if {$host_inx == $host_len} { log_warn "Hostname lacks a suffix: $hostname" return "-1" } # Strip off leading zeros to avoid doing octal arithmetic set suffix [string range $hostname $host_inx $host_len] set suffix_len [string length $suffix] for {set suffix_inx 0} {$suffix_inx < [expr $suffix_len - 1]} {incr suffix_inx} { set suffix_char [string index $suffix $suffix_inx] if {[string compare $suffix_char "0"] != 0} { break } } return [string range $suffix $suffix_inx $suffix_len] } ################################################################ # # NAME # check_acct_associations - checks associations # # SYNOPSIS # check_acct_associations # # DESCRIPTION # Use sacctmgr to check associations # # RETURN VALUE # true if no error is found, false otherwise # ################################################################ proc check_acct_associations { } { global sacctmgr number re_word_str exit_code set rc true log_user 0 log_debug "Sanity-Checking Associations" # # Use sacctmgr to check associations # set s_pid [spawn $sacctmgr -n -p list assoc wopi wopl withd format=lft,rgt,cluster] expect { -re "($number)\\|($number)\\|($re_word_str)\\|" { # Here we are checking if we have duplicates and # setting up an array to check for holes later set cluster $expect_out(3,string) if { ![info exists c_min($cluster)] } { set c_min($cluster) -1 set c_max($cluster) -1 } set num1 $expect_out(1,string) set num2 $expect_out(2,string) set first [info exists found($cluster,$num1)] set sec [info exists found($cluster,$num2)] #log_debug "$first=$num1 $sec=$num2" if { $first } { log_error "$cluster found lft $num1 again" set rc false } elseif { $sec } { log_error "$cluster found rgt $num2 again" set rc false } else { set found($cluster,$num1) 1 set found($cluster,$num2) 1 if { $c_min($cluster) == -1 || $c_min($cluster) > $num1 } { set c_min($cluster) $num1 } if { $c_max($cluster) == -1 || $c_max($cluster) < $num2 } { set c_max($cluster) $num2 } } exp_continue } timeout { log_error "sacctmgr add not responding" slow_kill $s_pid set exit_code 1 } eof { wait } } foreach cluster [array names c_min] { # Here we are checking for holes in the list from above for {set inx $c_min($cluster)} {$inx < $c_max($cluster)} {incr inx} { if { ![info exists found($cluster,$inx)] } { log_error "$cluster No index at $inx" set rc false } } } log_user 1 return $rc } ################################################################ # # NAME # get_job_acct_freq - gets the value of the job account gather frequency # # SYNOPSIS # get_job_acct_freq # # RETURN VALUE # job account gather frequency # ################################################################ proc get_job_acct_freq { } { global scontrol number log_user 0 set freq_val 0 spawn $scontrol show config expect { -re "JobAcctGatherFrequency *= ($number)" { set freq_val $expect_out(1,string) if {$freq_val == 0} { set freq_val 0 } } -re "JobAcctGatherFrequency *= task=($number)" { set freq_val $expect_out(1,string) if {$freq_val == 0} { set freq_val 0 } } eof { wait } } log_user 1 return $freq_val } ################################################################ # # NAME # get_admin_level - gets the AdminLevel for the current user # # SYNOPSIS # get_admin_level # # RETURN VALUE # AdminLevel for the current user # ################################################################ proc get_admin_level { } { global sacctmgr re_word_str re_word_str bin_id exit_code set admin_level "" set user_name "" if {[is_super_user]} { return "Administrator" } set user_name [get_my_user_name] if { ![string length $user_name] } { log_error "No name returned from id" return "" } # # Use sacctmgr to check admin_level # log_user 0 set s_pid [spawn $sacctmgr -n -P list user $user_name format=admin] expect { -re "($re_word_str)" { set admin_level $expect_out(1,string) exp_continue } timeout { slow_kill $s_pid fail "sacctmgr add not responding" } eof { wait } } log_user 1 return $admin_level } ################################################################ # # NAME # get_control_machine - gets the ControlMachine parameter # # SYNOPSIS # get_control_machine # # RETURN VALUE # ControlMachine value # ################################################################ proc get_control_machine { } { global scontrol re_word_str exit_code # # Use scontrol to find the ControlMachine # log_user 0 set control_machine "" set scon_pid [spawn -noecho $scontrol show config] expect { # We need to handle two possible outputs of SlurmctldHost[0]: # a) hostname # b) hostname(IP) -re "SlurmctldHost.0. *= ($re_word_str)" { set par_idx [expr [string last "(" $expect_out(1,string)] -1] if { $par_idx == -2 } { set control_machine $expect_out(1,string) } else { set control_machine [string range $expect_out(1,string) 0 $par_idx] } exp_continue } timeout { log_error "scontrol not responding" slow_kill $scon_pid set exit_code 1 } eof { wait } } log_user 1 return $control_machine } ################################################################ # # NAME # get_node_cnt_in_part - determines how many nodes are in a given partition # # SYNOPSIS # get_node_cnt_in_part ?partition? # # DESCRIPTION # Determine how many nodes are in a given partition. # If the partition is not specified, the default partition will be used. # # RETURN VALUE # Returns count of nodes in a partition or 0 if unknown. # ################################################################ proc get_node_cnt_in_part { {partition ""} } { global scontrol number if {[string length $partition] == 0} { set partition [default_partition] } log_user 0 set node_cnt 0 set scon_pid [spawn -noecho $scontrol show partition $partition] expect { -re "not found" { log_error "Partition $partition doesn't exist" } -re "TotalNodes=($number)" { set node_cnt $expect_out(1,string) exp_continue } timeout { log_error "scontrol not responding" } eof { } } log_user 1 return $node_cnt } ################################################################ # # NAME # get_idle_node_in_part - gets an idle node in a given partition # # SYNOPSIS # get_idle_node_in_part ?partition? # # DESCRIPTION # Get an idle node in a given partition. # If the partition is not specified, the default partition will be used. # # RETURN VALUE # Returns name of node in a partition or "" if unknown. # ################################################################ proc get_idle_node_in_part { {partition ""} } { global scontrol sinfo re_word_str if {[string length $partition] == 0} { set partition [default_partition] } log_user 0 set host_list "" spawn $sinfo -oNAME=%N -h -p$partition --state=idle expect { -re "not found" { log_error "Partition $partition doesn't exist" } -re "NAME=($re_word_str)" { set host_list $expect_out(1,string) } timeout { log_error "sinfo not responding" } eof { wait } } set node_name "" spawn $scontrol show hostname $host_list expect { -re "($re_word_str)" { set node_name $expect_out(1,string) } timeout { log_error "scontrol not responding" } eof { wait } } log_user 1 return $node_name } ############################################################### # # NAME # change_subbp_state - sets sub mid plane state # # SYNOPSIS # change_subbp_state node ionodes state # # RETURN VALUE # Returns SUCCESS if state of mid plane is changed # ############################################################### proc change_subbp_state { node ionodes state } { global scontrol smap set return_code 0 set my_pid [spawn $scontrol update subbpname=$node\[$ionodes\] state=$state] expect { -re "slurm_update error:" { set return_code 1 exp_continue } -re "Unable to contact" { log_error "Slurm appears to be down" exp_continue } timeout { log_error "scontrol not responding" slow_kill $my_pid set return_code 1 } eof { wait } } if { $return_code } { return $return_code } set match 0 set my_pid [spawn $smap -Db -c -h -n $node -I $ionodes] expect { -nocase -re "$state" { incr match exp_continue } -re "$node" { incr match exp_continue } -re "Unable to contact" { log_error "Slurm appears to be down" exp_continue } timeout { log_error "smap not responding" slow_kill $my_pid set return_code 1 } eof { wait } } if {$match != 2} { log_error "Subbp did not go into $state state. $match" set return_code 1 } return $return_code } ################################################# # # NAME # scale_to_megs - scales the value by the factor T|G|M to megabytes # # SYNOPSIS # scale_to_megs value factor # # DESCRIPTION # scale the value by the factor T|G|M to megabytes # # RETURN VALUE # the scaled variable # ################################################# proc scale_to_megs { value factor } { if {[string compare $factor "T"] == 0} { set value [expr $value * 1024 * 1024] } elseif {[string compare $factor "G"] == 0} { set value [expr $value * 1024] } elseif {[string compare $factor "M"] == 0} { set value [expr $value * 1] } elseif {[string compare $factor "K"] == 0} { set value [expr $value / 1024] set value [expr {round($value)}] } else { set value [expr $value / (1024 * 1024)] set value [expr {round($value)}] } return $value } ################################################# # # NAME # scale_to_ks - scales the value by the factor G|M|K to kilobytes # # SYNOPSIS # scale_to_ks value factor # # DESCRIPTION # scale the value by the factor G|M|K to kilobytes # # RETURN VALUE # the scaled variable # ################################################# proc scale_to_ks { value factor } { if {[string compare $factor "G"] == 0} { set value [expr $value * 1024 * 1024] } elseif {[string compare $factor "M"] == 0} { set value [expr $value * 1024] } elseif {[string compare $factor "K"] == 0} { set value [expr $value * 1] } else { set value [expr $value / 1024] set value [expr {round($value)}] } return $value } ############################################################ # # NAME # check_config_node_mem - checks that the nodes have memory configured # # SYNOPSIS # check_config_node_mem # # RETURN VALUE # true if all nodes have memory, false otherwise # ############################################################ proc check_config_node_mem { } { set nodes_dict [get_nodes] dict for {node_name node_dict} $nodes_dict { if [dict exists $node_dict "RealMemory"] { if {[dict get $node_dict "RealMemory"] == 1} { return false } } else { log_warn "Parameter RealMemory not found on node $node_name" return false } } return true } ################################################################ # # NAME # slurmctld_plug_stack_nonstop - verifies that the SlurmctldPlugstack is set to nonstop # # SYNOPSIS # slurmctld_plug_stack_nonstop # # DESCRIPTION # Use scontrol to determine that the SlurmctldPlugstack is set to nonstop. # # RETURN VALUE # 1 if the value is set to nonstop. # ################################################################ proc slurmctld_plug_stack_nonstop { } { global scontrol re_word_str exit_code log_user 0 set nonstop_enforce 0 set scon_pid [spawn -noecho $scontrol show config] expect { -re "SlurmctldPlugstack *= ($re_word_str)" { if {[string first $expect_out(1,string) "nonstop"] != -1} { set nonstop_enforce 1 } exp_continue } timeout { log_error "scontrol not responding" slow_kill $scon_pid set exit_code 1 } eof { wait } } log_user 1 return $nonstop_enforce } ################################################################ # # NAME # job_submit_all_partitions - determines if the JobSubmitPlugins includes "all_partitions" # # SYNOPSIS # job_submit_all_partitions # # DESCRIPTION # Use scontrol to determine if the JobSubmitPlugins # includes "all_partitions". # # RETURN VALUE # 1 if the value is set to nonstop. # ################################################################ proc job_submit_all_partitions { } { global scontrol re_word_str exit_code log_user 0 set all_partitions 0 set scon_pid [spawn -noecho $scontrol show config] expect { -re "JobSubmitPlugins *= ($re_word_str)" { if {[string first $expect_out(1,string) "all_partitions"] != -1} { set all_partitions 1 } exp_continue } timeout { log_error "scontrol not responding" slow_kill $scon_pid set exit_code 1 } eof { wait } } log_user 1 return $all_partitions } ################################################################ # # NAME # wait_for_node - waits for nodes in a partition to reach a certain state # # SYNOPSIS # wait_for_node ?options? state num_nodes ?partition? # # DESCRIPTION # Wait for a certain number of nodes in a partition to reach a certain # state. # # OPTIONS # -timeout # time in seconds to wait for the node state before # timing out (default is 3) # -pollinterval # time in seconds between each node state check (default is 1) # # ARGUMENTS # state # The node state to wait for # num_nodes # The number of nodes we want to be in the specified state # partition # Partition name (the default partition is used if not specified) # # RETURN VALUE # RETURN_SUCCESS, or non-zero on failure # ################################################################ proc wait_for_node args { global sinfo number set partition "" set timeout 3 set poll_interval 1 set desired_state "PENDING" set desired_reason_list "" while {[llength $args]} { switch -glob -- [lindex $args 0] { -time* {set args [lassign $args - timeout]} -poll* {set args [lassign $args - poll_interval]} -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count < 2} { fail "Too few arguments ($argument_count): $args" } else { lassign $args state num_nodes } if {$argument_count == 3} { set partition [lindex $args 2] } if {$argument_count > 3} { fail "Too many arguments ($argument_count): $args" } set wait_time 0 set done 0 set cnt 0 set rt $::RETURN_SUCCESS if {[string length $partition] == 0} { set partition [default_partition] } while {$done != 1 && $wait_time < $timeout} { log_user 0 spawn $sinfo --noheader --partition $partition --state $state --format %D expect { -re "($number)" { set cnt $expect_out(1,string) exp_continue } timeout { log_error "sinfo is not responding" set rt $::RETURN_TIMEOUT } eof { wait } } log_user 1 if {$num_nodes <= $cnt} { set done 1 } else { log_debug "Partition $partition has $cnt nodes idle and we want $num_nodes" sleep $poll_interval incr wait_time 1 } } if {$done != 1} { set rt $::RETURN_ERROR } return $rt } ##################################################################### # # NAME # available_nodes_hostnames - gets all available nodes in the system # # SYNOPSIS # available_nodes_hostnames partition ?also_power_save? # # ARGUMENTS # # partition # to only return nodes of an specific partition # also_power_save # to include POWERING_DOWN and POWER_DOWN nodes # # RETURN VALUE # idle nodes, and also_power_save nodes if specified # ##################################################################### proc available_nodes_hostnames { partition {also_power_save false} } { global sinfo re_word_str exit_code log_user 0 set idle_nodelist "" set sep "" if {$also_power_save} { set avail_states "idle,power_down,powering_down" } else { set avail_states "idle" } if {[string compare $partition ""] == 0} { spawn $sinfo -t$avail_states -h -o%N } else { spawn $sinfo -t$avail_states -h -o%N -p$partition } expect { -re "($re_word_str)" { append idle_nodelist $sep append idle_nodelist $expect_out(1,string) set sep "," exp_continue } timeout { log_error "sinfo is not responding" set exit_code 1 } eof { wait } } log_user 1 return $idle_nodelist } ##################################################################### # # NAME # get_partition_nodes - gets the list of node names in a given partition/states # # SYNOPSIS # get_partition_nodes partition states # # DESCRIPTION # Get the list of node names in a given partition/states # # ARGUMENTS # partition # partition to get nodes off # states # states to filter on nodes # # RETURN VALUE # node names list, -1 on sinfo error # ##################################################################### proc get_partition_nodes {partition states} { global sinfo re_word_str log_user 0 set node_list "" if {[string length $partition] == 0} { set partition [default_partition] } if {[string length $states] == 0} { set sinfo_pid [spawn -noecho $sinfo -h -N -p $partition -o %N -e] } else { set sinfo_pid [spawn -noecho $sinfo -h -N -p $partition -o %N -t $states -e] } expect { -re "($re_word_str)" { lappend node_list $expect_out(1,string) exp_continue } timeout { log_error "sinfo not responding" slow_kill $sinfo_pid set exit_code 1 } eof { wait } } log_user 1 return $node_list } ##################################################################### # # NAME # set_partition_maximum_time_limit - sets the maximum time limit in a given partition # # SYNOPSIS # set_partition_maximum_time_limit partition limit # # RETURN VALUE # RETURN_SUCCESS, or non-zero on error # ##################################################################### proc set_partition_maximum_time_limit {partition limit} { global scontrol exit_code if {[string length $partition] == 0} { set partition [default_partition] if { $partition == "" } { return $::RETURN_ERROR } } if { $limit < -1 } { log_error "Trying to set invalid partition time limit of $limit" return $::RETURN_ERROR } if { $limit == -1 } { set expected_lim "UNLIMITED" } else { set expected_lim limit } spawn $scontrol update partitionname=$partition MaxTime=-1 expect { timeout { log_error "scontrol not responding" return $::RETURN_TIMEOUT } eof { wait } } set maxtime [get_partition_maximum_time_limit $partition] if { $maxtime != $limit } { log_error "Unable to update partition MaxTime, got $maxtime, wanted $limit" return $::RETURN_ERROR } if { $exit_code != 0 } { log_error "set_partition_maximum_time_limit: Unexpected error." return $::RETURN_ERROR } return $::RETURN_SUCCESS } ##################################################################### # # NAME # get_partition_maximum_time_limit - gets the maximum time limit in a given partition # # SYNOPSIS # get_partition_maximum_time_limit partition # # DESCRIPTION # Get the maximum time limit in a given partition # # RETURN VALUE # time limit in seconds, -1 if undefined or error # ##################################################################### proc get_partition_maximum_time_limit {partition} { global sinfo number exit_code if {[string length $partition] == 0} { set partition [default_partition] } set secs 0 log_user 0 set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O time -e] expect { -re "infinite" { set secs -1 exp_continue } -re "n/a" { set secs -1 exp_continue } -re "($number)-($number):($number):($number)" { set days [expr $expect_out(1,string) * 24 * 60 * 60] set hours [expr $expect_out(2,string) * 60 * 60] set mins [expr $expect_out(3,string) * 60] set secs [expr $days + $hours + $mins + $expect_out(4,string)] exp_continue } -re "($number):($number):($number)" { set hours [expr $expect_out(1,string) * 60 * 60] set mins [expr $expect_out(2,string) * 60] set secs [expr $hours + $mins + $expect_out(3,string)] exp_continue } -re "($number):($number)" { set mins [expr $expect_out(1,string) * 60] set secs [expr $mins + $expect_out(2,string)] exp_continue } -re "($number)" { set secs [expr $expect_out(1,string) * 60] exp_continue } timeout { log_error "sinfo not responding" slow_kill $sinfo_pid set exit_code 1 } eof { wait } } log_user 1 return $secs } ################################################################ # # NAME # get_partition_default_time_limit - gets the default time limit in a given partition # # SYNOPSIS # get_partition_default_time_limit ?partition? # # DESCRIPTION # Get the default time limit in a given partition. # If the partition is not specified, the default partition will be used. # # RETURN VALUE # Returns: time limit in seconds, -1 if undefined or error. # ################################################################ proc get_partition_default_time_limit { {partition ""} } { global sinfo number exit_code if {[string length $partition] == 0} { set partition [default_partition] } set secs 0 log_user 0 set sinfo_pid [spawn -noecho $sinfo -h -p $partition -O defaulttime -e] expect { -re "infinite" { set secs -1 exp_continue } -re "n/a" { set secs -1 exp_continue } -re "($number)-($number):($number):($number)" { set days [expr $expect_out(1,string) * 24 * 60 * 60] set hours [expr $expect_out(2,string) * 60 * 60] set mins [expr $expect_out(3,string) * 60] set secs [expr $days + $hours + $mins + $expect_out(4,string)] exp_continue } -re "($number):($number):($number)" { set hours [expr $expect_out(1,string) * 60 * 60] set mins [expr $expect_out(2,string) * 60] set secs [expr $hours + $mins + $expect_out(3,string)] exp_continue } -re "($number):($number)" { set mins [expr $expect_out(1,string) * 60] set secs [expr $mins + $expect_out(2,string)] exp_continue } -re "($number)" { set secs [expr $expect_out(1,string) * 60] exp_continue } timeout { log_error "sinfo not responding" slow_kill $sinfo_pid set exit_code 1 } eof { wait } } log_user 1 return $secs } ##################################################################### # # NAME # get_node_cores - given a node, returns its total number of cores # # SYNOPSIS # get_node_cores node # # DESCRIPTION # Given a node, return its total number of cores # (not the CoresPerSocket, but the total cores) # # RETURN VALUE # node cores if retrieved, -1 otherwise # ##################################################################### proc get_node_cores {node} { global sinfo number set cores -1 set sockets_per_node 0 set cores_per_socket 0 if {[string length $node] == 0} { return $cores } log_user 0 set sinfo_pid [spawn -noecho $sinfo -o "%X %Y" -h -n $node] expect { -re "($number)" { if {$sockets_per_node == 0} { set sockets_per_node $expect_out(1,string) } else { set cores_per_socket $expect_out(1,string) } exp_continue } timeout { log_error "sinfo not responding" slow_kill $sinfo_pid set exit_code 1 } eof { wait } } log_user 1 set cores [expr $sockets_per_node * $cores_per_socket] return $cores } ##################################################################### # # NAME # get_node_cpus - given a node, returns its total number of threads we account for # # SYNOPSIS # get_node_cpus node # # DESCRIPTION # Given a node, return its total number of threads we account for. # (not always ThreadsPerCore, but how many threads are in use. # i.e. CPUs=6 CoresPerSocket=6 ThreadsPerCore=2 Socket=1 would # result in only 1 thread we care about instead of the 2 listed.) # # RETURN VALUE # list of node [ tot_cpus threads ] if retrieved, [ -1 -1 ] otherwise # ##################################################################### proc get_node_cpus {node} { global scontrol number set nthreads -1 set nsockets 0 set ncores 0 set totcpus -1 if {[string length $node] == 0} { return [list $totcpus $nthreads] } # Get the number of CPUs on a node set my_pid [spawn $scontrol show node $node] expect { -re "CoresPerSocket=($number)" { set ncores $expect_out(1,string) exp_continue } -re "CPUTot=($number)" { set totcpus $expect_out(1,string) exp_continue } -re "Sockets=($number)" { set nsockets $expect_out(1,string) exp_continue } -re "ThreadsPerCore=($number)" { set nthreads $expect_out(1,string) exp_continue } timeout { log_error "scontrol is not responding" slow_kill $scontrol_pid set exit_code 1 } eof { wait } } set core_cnt [expr $nsockets * $ncores] set thread_cnt [expr $ncores * $nthreads] if {$totcpus != $nthreads && $totcpus == $ncores} { log_debug "Cores rather than threads are being allocated" set nthreads 1 } return [list $totcpus $nthreads] } ##################################################################### # # NAME # get_part_total_cores - given a partition and/or states, return its total cores # # SYNOPSIS # get_part_total_cores partition states # # DESCRIPTION # Given a partition and/or states, return its total cores # # ARGUMENTS # partition # partition to check cores # states # states to filter on partition cores # # RETURN VALUE # partition cores # ##################################################################### proc get_part_total_cores {part states} { global sinfo number log_user 0 set cores 0 set tmp 0 set i 0 if {[string length $part] == 0} { set part [default_partition] } if {[string length $states] == 0} { set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -o "%X %Y"] } else { set sinfo_pid [spawn -noecho $sinfo -h -N -p $part -t $states -o "%X %Y"] } expect { -re "($number)" { set is_even [expr {($i % 2) == 0}] if {$is_even == 1} { set tmp $expect_out(1,string) } else { set tmp [expr $tmp * $expect_out(1,string)] set cores [expr $cores + $tmp] } incr i exp_continue } timeout { log_error "sinfo not responding" slow_kill $sinfo_pid set exit_code 1 } eof { wait } } log_user 1 return $cores } ##################################################################### # # NAME # check_hosts_contiguous - verify if all hosts belong to the partition and are contiguous # # SYNOPSIS # check_hosts_contiguous check_hosts_list partition # # DESCRIPTION # Given a partition and a list of hosts, verify if all # hosts belong to the partition and are contiguous. # If the partition argument is empty, the default partition # will be used. # # RETURN VALUE # Returns: true if hosts are contiguous, false otherwise. # ##################################################################### proc check_hosts_contiguous { check_hosts_list {partition ""} } { global sinfo re_word_str if {[string length $partition] == 0} { set partition [default_partition] } set part_hosts_list {} log_user 0 set sinfo_pid [spawn $sinfo --noheader -p $partition -N -o %N] expect { -re "($re_word_str)" { lappend part_hosts_list $expect_out(1,string) exp_continue } -re "Unable to contact" { log_user 1 fail "Slurm appears to be down" } timeout { log_user 1 slow_kill $sinfo_pid fail "sinfo not responding" } eof { wait } } log_user 1 foreach host $check_hosts_list { set idx_cur [lsearch $part_hosts_list $host] if {$idx_cur == -1} { fail "$host not found in list of hosts from partition $partition" } if {[info exists idx_old]} { if {$idx_cur != [expr $idx_old + 1]} { log_error "Node sequence number not contiguous" return false } } set idx_old $idx_cur } return true } ##################################################################### # # NAME # stop_root_user - detect, warn, and stop root user # # SYNOPSIS # stop_root_user # # DESCRIPTION # Detect, warn, and stop root user # # RETURN VALUE # uid if not root user, exits otherwise # ##################################################################### proc stop_root_user {} { global bin_id number set uid [get_my_uid] if {$uid == -1} { fail "Can't get my uid" } elseif {$uid == 0} { skip "Can't run this test as user root" } return $uid } proc expect_extern_step { } { global scontrol # If PrologFlags=contain is in slurm.conf, then an "extern" step will be # launched on each node, so we need to check for 3 steps per # job instead of 2. log_user 0 set extern_step 0 set scon_pid [spawn -noecho $scontrol show config] expect { -re "PrologFlags\\s*=\\s*\[A-z/,\]*Contain" { set extern_step 1 } timeout { fail "scontrol show config not responding" } eof { wait } } log_user 1 return $extern_step } ################################################################ # # NAME # get_het_job_ids - gets list of component job ids for a het job # # SYNOPSIS # get_het_job_ids job_id ?use_offset? # # DESCRIPTION # Gets list of component job ids for a het job. # # ARGUMENTS # job_id # Slurm job id # use_offset # If zero, returns list of integer job ids, else returns ids in # the form of X+Y where X is het job master id and Y is the # offset. # # RETURN VALUE # A list of ids for a hetjob or an empty list if jobid # is not a het one. # ################################################################ proc get_het_job_ids { jobid {use_offset 0}} { global scontrol number set id_list "" set log_user_save [log_user -info] log_user 0 spawn $scontrol show job $jobid expect { -re "JobId=($number) HetJobId=($number) HetJobOffset=($number)" { if { $use_offset } { lappend id_list "$expect_out(2,string)+$expect_out(3,string)" } else { lappend id_list $expect_out(1,string) } exp_continue } timeout { log_error "scontrol not responding" set id_list "" } eof { wait } } log_user $log_user_save return $id_list } ################################################################ # # NAME # reconfigure - calls scontrol reconfigure # # SYNOPSIS # reconfigure ?cluster? # # DESCRIPTION # Calls scontrol reconfigure. # # ARGUMENTS # cluster # The cluster to reconfigure # # RETURN VALUE # void # # ENVIRONMENT # Sets exit_code to 1 on failure. # ################################################################ proc reconfigure { {cluster ""} } { global exit_code scontrol timeout # # Increase timeout just in case we're running under valgrind # set save_timeout $timeout set timeout 20 if { $cluster == "" } { spawn $scontrol reconfigure } else { spawn $scontrol -M$cluster reconfigure } expect { -re "slurm_reconfigure error: Invalid user id" { log_error "Invalid user id" set exit_code 1 exp_continue } -re "Error|error" { log_error "scontrol reconfigure error" set exit_code 1 exp_continue } timeout { log_error "scontrol not responding" set exit_code 1 } eof { wait } } # # Wait for reconfigure to complete, then reset timeout and return. # sleep 5 set timeout $save_timeout } ##################################################################### # # NAME # log_fatal - prints a fatal message # # SYNOPSIS # log_fatal message # # SEE ALSO # _log_format for options governing the message format and colorization # ##################################################################### proc log_fatal {message} { global testsuite_log_level LOG_LEVEL_FATAL if {$testsuite_log_level >= $LOG_LEVEL_FATAL} { _log_format "fatal" "$message" } } ##################################################################### # # NAME # log_error - prints an error message # # SYNOPSIS # log_error message # # SEE ALSO # _log_format for options governing the message format and colorization # ##################################################################### proc log_error {message} { global testsuite_log_level LOG_LEVEL_ERROR if {$testsuite_log_level >= $LOG_LEVEL_ERROR} { _log_format "error" "$message" } } ##################################################################### # # NAME # log_warn - prints a warning message # # SYNOPSIS # log_warn message # # SEE ALSO # _log_format for options governing the message format and colorization # ##################################################################### proc log_warn {message} { global testsuite_log_level LOG_LEVEL_WARNING if {$testsuite_log_level >= $LOG_LEVEL_WARNING} { _log_format "warning" "$message" } } ##################################################################### # # NAME # log_info - prints an information message # # SYNOPSIS # log_info message # # SEE ALSO # _log_format for options governing the message format and colorization # ##################################################################### proc log_info {message} { global testsuite_log_level LOG_LEVEL_INFO if {$testsuite_log_level >= $LOG_LEVEL_INFO} { _log_format "info" "$message" } } ##################################################################### # # NAME # log_debug - prints a debug level message # # SYNOPSIS # log_debug message # # SEE ALSO # _log_format for options governing the message format and colorization # ##################################################################### proc log_debug {message} { global testsuite_log_level LOG_LEVEL_DEBUG if {$testsuite_log_level >= $LOG_LEVEL_DEBUG} { _log_format "debug" "$message" } } ##################################################################### # # NAME # log_trace - prints a trace level message # # SYNOPSIS # log_trace message # # SEE ALSO # _log_format for options governing the message format and colorization # ##################################################################### proc log_trace {message} { global testsuite_log_level LOG_LEVEL_TRACE if {$testsuite_log_level >= $LOG_LEVEL_TRACE} { _log_format "trace" "$message" } } ################################################################ # # NAME # in_fed - checks whether this cluster is in a federation # # SYNOPSIS # in_fed # # RETURN VALUE # Returns true if this cluster is in a federation, false otherwise # ################################################################ proc in_fed {} { global scontrol spawn $scontrol show fed expect { -re "Federation" { return true } timeout { log_error "scontrol not responding" } eof { wait } } return false } ################################################################ # # NAME # check_job_state - checks if the state of a job is the expected one # # SYNOPSIS # check_job_state job state ?het_job? # # DESCRIPTION # Checks if the state of a job is the expected one. # # ARGUMENTS # job # Job ID to check # state # Desired state of the job to match # het_job # If set, checks state of each component job if the # job is a hetjob. # # RETURN VALUE # true if job was on the desired state, or the number of job components # on that state if it's a hetjob and het_job option enabled, false # otherwise. # # ENVIRONMENT # Also sets exit_code to 1 if there are some error in the called commands. # ################################################################ proc check_job_state { job state {het_job 0}} { global scontrol exit_code set jid_list "" if { $het_job } { set jid_list [get_het_job_ids $job 1] } if { $jid_list == "" } { # non-het job set jid_list $job } foreach jid $jid_list { set state_match 0 spawn $scontrol show job $jid expect { -re "JobState=($state)" { incr state_match } timeout { log_error "scontrol not responding" set exit_code 1 return false } eof { wait } } if {$state_match != 1} { log_error "job $jid should be in $state state, but is not" return false } } return true } ################################################################ # # NAME # get_gres_count - returns a dict of nodes and GRES counts # # SYNOPSIS # get_gres_count gres_name ?node_list? # # DESCRIPTION # Returns a dict of node names and the count of a specifed # GRES aggregating all its types on each node. # # RETURN VALUE # If the node_list is not specified node name is specified, # this function will return a dict with the GRES count for all # the nodes of the default partition. # If specified, a dict only with the nodes of the node_list. # ################################################################ proc get_gres_count { gres_name {node_list ""} } { set nodes_dict [get_nodes $node_list] set nodes_gres_dict [dict create] dict for {node_name node_dict} $nodes_dict { if [dict exists $node_dict "Gres"] { set gres_param [dict get $node_dict "Gres"] set gres_dict [count_gres $gres_param] } if [dict exists $gres_dict $gres_name] { set gres_count [dict get $gres_dict $gres_name] dict set nodes_gres_dict $node_name $gres_count } } return $nodes_gres_dict } ################################################################ # # NAME # count_gres - returns a dict of GRES names and their total counts # # SYNOPSIS # count_gres gres_param # # DESCRIPTION # Parses a GRES parameter string typically obtained from nodes or # jobs info, and returns a dict of GRES names and their count # aggregating all the types of each GRES. # # RETURN VALUE # A dict of GRES names and their count aggregating all types of # each GRES. # ################################################################ proc count_gres { gres_param } { global gres_regex set gres_dict [dict create] foreach gres [split $gres_param ","] { if {[regexp $gres_regex $gres {} name type count] == 1} { if {$count eq ""} { set count $type } if {[dict exists $gres_dict $name]} { dict set gres_dict $name [expr [dict get $gres_dict $name] + $count] } else { dict set gres_dict $name $count } } } return $gres_dict } ################################################################ # # NAME # get_highest_gres_count - returns highest number of GRES per node on node_count nodes # # SYNOPSIS # get_highest_gres_count node_count gres_name # # DESCRIPTION # For a given number of nodes, returns the highest GRES count per # node available on at least that number of nodes. # # EXAMPLE # For example: node1 has 1 GPU, node2 has 2 GPUs and node3 has 3 GPUs # [get_highest_gres_count 1 "gpu"] returns 3 (i.e. 1 node 3 GPUs) # [get_highest_gres_count 2 "gpu"] returns 2 (i.e. 2 nodes have at least 2 GPUs each) # [get_highest_gres_count 3 "gpu"] returns 1 (i.e. 3 nodes have at least 1 GPU each) # ################################################################ proc get_highest_gres_count { node_count gres_name } { set available_nodes [available_nodes_hostnames [default_partition]] set gres_dict [get_gres_count $gres_name $available_nodes] set gres_count [list] dict for {node gres} $gres_dict { lappend gres_count $gres } set count [lindex [lsort -decreasing -integer $gres_count] [expr $node_count - 1]] return $count } ################################################################ # # NAME # _set_gpu_socket_inx - adds a socket index to the gpu_sock_list if not already on it # # SYNOPSIS # _set_gpu_socket_inx sock_inx # # DESCRIPTION # Add a socket index to the array gpu_sock_list if not already # on the list. Subroutine used by get_gpu_socket_count # ################################################################ proc _set_gpu_socket_inx { sock_inx } { global gpu_sock_list if {$sock_inx == -1} { set gpu_sock_list [lreplace $gpu_sock_list 0 99] return } set sock_cnt [llength $gpu_sock_list] for {set i 0} {$i < $sock_cnt} {incr i} { if {[lindex $gpu_sock_list $i] == $sock_inx} { return } } lappend gpu_sock_list $sock_inx } ################################################################ # Subroutine used by get_gpu_socket_count # Add a socket index to the array gpu_sock_list if not already # on the list. ################################################################ proc _set_gpu_socket_range { sock_first_inx sock_last_inx } { global gpu_sock_list set sock_cnt [llength $gpu_sock_list] for {set s $sock_first_inx} {$s <= $sock_last_inx} {incr s} { set found 0 for {set i 0} {$i < $sock_cnt} {incr i} { if {[lindex $gpu_sock_list $i] == $s} { set found 1 break } } if {$found == 0} { lappend gpu_sock_list $s } } } ################################################################ # # NAME # get_gpu_socket_count - returns the number of sockets with GPUS on a node with the given per-node GPU count # # SYNOPSIS # get_gpu_socket_count gpu_cnt sockets_per_node # # DESCRIPTION # Given a per-node GPU count, return the number of sockets with # GPUs on a node with the given per-node GPU count. # If the sockets_per_node has a value of 1 then just return 1 # rather than determine the count (for performance reasons). # ################################################################ proc get_gpu_socket_count { gpu_cnt sockets_per_node } { global re_word_str bin_rm number scontrol srun global gpu_sock_list set sockets_with_gpus 1 set file_in "test_get_gpu_socket_count.input" if {$sockets_per_node == 1} { return 1 } log_user 0 _set_gpu_socket_inx -1 make_bash_script $file_in "$scontrol show node \$SLURMD_NODENAME" spawn $srun -N1 --gres=gpu:$gpu_cnt $file_in expect { -re "gpu:${number}.S:($number)-($number)" { _set_gpu_socket_range $expect_out(1,string) $expect_out(2,string) exp_continue } -re "gpu:${re_word_str}:${number}.S:($number),($number),($number),($number)" { _set_gpu_socket_inx $expect_out(1,string) _set_gpu_socket_inx $expect_out(2,string) _set_gpu_socket_inx $expect_out(3,string) _set_gpu_socket_inx $expect_out(4,string) exp_continue } -re "gpu:${re_word_str}:${number}.S:($number),($number),($number)" { _set_gpu_socket_inx $expect_out(1,string) _set_gpu_socket_inx $expect_out(2,string) _set_gpu_socket_inx $expect_out(3,string) exp_continue } -re "gpu:${re_word_str}:${number}.S:($number),($number)" { _set_gpu_socket_inx $expect_out(1,string) _set_gpu_socket_inx $expect_out(2,string) exp_continue } -re "gpu:${re_word_str}:${number}.S:($number)" { _set_gpu_socket_inx $expect_out(1,string) exp_continue } -re "gpu:${number}.S:($number),($number),($number),($number)" { _set_gpu_socket_inx $expect_out(1,string) _set_gpu_socket_inx $expect_out(2,string) _set_gpu_socket_inx $expect_out(3,string) _set_gpu_socket_inx $expect_out(4,string) exp_continue } -re "gpu:${number}.S:($number),($number),($number)" { _set_gpu_socket_inx $expect_out(1,string) _set_gpu_socket_inx $expect_out(2,string) _set_gpu_socket_inx $expect_out(3,string) exp_continue } -re "gpu:${number}.S:($number),($number)" { _set_gpu_socket_inx $expect_out(1,string) _set_gpu_socket_inx $expect_out(2,string) exp_continue } -re "gpu:${number}.S:($number)" { _set_gpu_socket_inx $expect_out(1,string) exp_continue } eof { wait } } log_user 1 exec $bin_rm -f $file_in set sock_cnt [llength $gpu_sock_list] if {$sock_cnt > 1} { set sockets_with_gpus $sock_cnt } return $sockets_with_gpus } ################################################################ # # NAME # get_highest_mps_count - get_highest_gres_count nodes mps, but for "mps per GPU" # # SYNOPSIS # get_highest_mps_count node_count # # DESCRIPTION # For a given number of nodes, returns the higest number of MPS per GPU # available at least on those number of nodes. # ################################################################ proc get_highest_mps_count { node_count } { # We cannot use get_highest_gres_count because we need "per gpu", # so we get all the mps per node and all gpus per node, to create # a mps_per_gpu list to sort and get the count. set available_nodes [available_nodes_hostnames [default_partition]] set mps_dict [get_gres_count "mps" $available_nodes] set gpu_dict [get_gres_count "gpu" $available_nodes] set mps_per_gpu [list] dict for {node mps} $mps_dict { if { $mps > 0 } { if [dict exists $gpu_dict $node] { set gpu [dict get $gpu_dict $node] if { $gpu > 0 } { lappend mps_per_gpu [expr $mps / $gpu] } else { fail "All nodes with MPS should have a GPU" } } else { fail "All nodes with MPS should have a GPU" } } } set count [lindex [lsort -decreasing -integer $mps_per_gpu] [expr $node_count - 1]] return $count } ################################################################ # # NAME # get_mps_node_count - gets the number of nodes with a positive number of GRES MPS # # SYNOPSIS # get_mps_node_count # # RETURN VALUE # Return the count of nodes with a non-zero count of GRES MPS # ################################################################ proc get_mps_node_count { } { global number sinfo re_word_str set fini 0 set node_inx 0 set def_part [default_partition] log_user 0 spawn $sinfo -N -p$def_part -oGRES=%G -h expect { -re "GRES=($re_word_str)" { set mps_count 0 set parts [split $expect_out(1,string) ",/"] while 1 { set mps_found [lsearch $parts "mps*"] if { $mps_found == -1 } break set parts2 [split [lindex $parts $mps_found] ":(/"] set col [lsearch -regexp $parts2 ^$number$] if { $col == -1 } { incr mps_count } else { set mps_count [expr $mps_count + [lindex $parts2 $col]] } set parts [lreplace $parts $mps_found $mps_found] } if {$mps_count > 0} { incr node_inx } exp_continue } eof { wait } } log_user 1 return $node_inx } ################################################################ # # NAME # cuda_count - determines the count of allocated GPUs # # SYNOPSIS # cuda_count cuda_string # # ARGUMENTS # cuda_string # Contents of a CUDA_VISIBLE_DEVICES environment variable # # RETURN VALUE # Return the number of GPUs or -1 on error # ################################################################ proc cuda_count { cuda_string } { set cuda_count 0 set has_number 0 set len [string length $cuda_string] for {set char_inx 0} {$char_inx < $len} {incr char_inx} { set cuda_char [string index $cuda_string $char_inx] if {[string match , $cuda_char]} { if {$has_number > 0} { incr cuda_count set has_number 0 } else { log_error "Invalid input ($cuda_string)" return -1 } } elseif {[string is digit $cuda_char]} { set has_number 1 } } if {$has_number > 0} { incr cuda_count } else { log_error "Invalid input ($cuda_string)" return -1 } return $cuda_count } ################################################################ # NAME # get_conf_path - gets the path to the slurm.conf file # # SYNOPSIS # get_conf_path # # RETURN VALUE # Returns the path to the slurm.conf file # ################################################################ proc get_conf_path { } { global scontrol re_word_str eol exit_code if [regexp {(.*)/slurm.conf} [get_config_param "SLURM_CONF"] {} config_dir] { return $config_dir } else { fail "Unable to determine config dir" } } ################################################################ # # NAME # copy_conf - backs up the slurm.conf file # # SYNOPSIS # copy_conf config_path cwd # # DESCRIPTION # Copy the slurm.conf file to the a new file called # slurm.conf.orig in the current working directory # # ARGUMENTS # config_path # The path to slurm.conf # cwd # The full path of the current working directory # ################################################################ proc copy_conf { config_path cwd } { global bin_cp bin_rm exit_code exec $bin_rm -fr $cwd/slurm.conf.orig spawn $bin_cp -v $config_path/slurm.conf $cwd/slurm.conf.orig expect { timeout { log_error "slurm.conf was not copied" set exit_code 1 } eof { wait } } } ################################################################ # # NAME # save_conf - saves a backup of the specfied configuration file # # SYNOPSIS # save_conf file_name # # DESCRIPTION # If the specified file_name exists, a backup is made which will be # restored when restore_conf is called. # If the specified file_name does not exist, a special backup will be # made that will cause the file to be removed when restore_conf is # called. # If a backup already exists, a warning is issued and no backup is made # (honoring the existing backup). # # SEE ALSO # restore_conf # ################################################################ proc save_conf { file_name } { global test_id global bin_chmod bin_cp bin_mv bin_touch log_debug "Saving backup of $file_name" # # Check for existing backup # If a backup exists, issue a warning and return (honor existing backup) # set conf_dir [file dirname $file_name] set dir_files [glob -nocomplain -directory $conf_dir *] set preexisting_backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"] if {$preexisting_backup_file ne ""} { log_warn "Backup file already exists: ($preexisting_backup_file)" return } # # Check if file to backup exists. # If it doesn't exist, warn the user, touch an empty backup file with # the sticky bit set and allow the test to continue. # restore_conf will remove the file. # set new_backup_file "$file_name.test$test_id" if {![file exists $file_name]} { log_warn "Backup of a nonexistent file requested: $file_name" run_command -fail -nolog "$bin_touch $new_backup_file" run_command -fail -nolog "$bin_chmod +t $new_backup_file" return } run_command -fail -nolog "$bin_mv $file_name $new_backup_file" run_command -fail -nolog "$bin_cp $new_backup_file $file_name" } ################################################################ # # NAME # restore_conf - restores the original confiration file from backup # # SYNOPSIS # restore_conf file_name # # DESCRIPTION # If a backup exists for the specified file_name, it is restored. # If the specified file_name did not exist when originally backed up, # it will be removed. # If no backup exists, a warning is issued. # # SEE ALSO # save_conf # ################################################################ proc restore_conf { file_name } { global test_id global bin_mv bin_rm log_debug "Restoring backup of $file_name" set conf_dir [file dirname $file_name] set dir_files [glob -nocomplain -directory $conf_dir *] set backup_file [lsearch -inline -regexp $dir_files "$file_name\\\.test\\d+\\\.\\d+\$"] if {$backup_file ne ""} { file stat $backup_file stat # If the sticky bit is set and the file is empty, remove both if {! $stat(size) && [expr $stat(mode) & 512]} { log_debug "Removing file used for the test: $file_name" run_command -fail -nolog "$bin_rm -f $backup_file $file_name" # Else replace the original with the backup } else { run_command -fail -nolog "$bin_mv $backup_file $file_name" } } else { # # If backup file doesn't exist, it has probably already been # restored by a previous call to restore_conf # log_warn "Backup file does not exist for $file_name. It has probably already been restored" return } } ################################################################ # # NAME # have_nvml - checks if HAVE_NVML is set in config.h # # SYNOPSIS # have_nvml # # RETURN VALUE # Returns true if HAVE_NVML is set in config.h. Else, returns false # ################################################################ proc have_nvml { } { global bin_grep number exit_code config_h set grep_fail 0 set have_nvml false log_user 0 spawn $bin_grep "HAVE_NVML" $config_h expect { -re "HAVE_NVML 1" { set have_nvml true exp_continue } timeout { set grep_fail 1 set exit_code 1 } eof { wait } } log_user 1 if {$grep_fail == 1} { log_warn "Could not grep $config_h for HAVE_NVML" set have_nvml false } return $have_nvml } ################################################################ # # NAME # delete_part - deletes partition on system # # SYNOPSIS # delete_part partition # ################################################################ proc delete_part { part_name } { global scontrol # Remove part spawn $scontrol delete partition=$part_name expect { timeout { log_error "scontrol is not responding" set exit_code 1 } eof { wait } } } ################################################################ # # NAME # have_lua - checks if HAVE_LUA is set in config.h # # SYNOPSIS # have_lua # # RETURN VALUE # Returns true if HAVE_LUA is set in config.h. Else, returns false # ################################################################ proc have_lua { } { global bin_grep number exit_code config_h set grep_fail 0 set have_lua false log_user 0 spawn $bin_grep "HAVE_LUA" $config_h expect { -re "HAVE_LUA 1" { set have_lua true exp_continue } timeout { set grep_fail 1 set exit_code 1 } eof { wait } } log_user 1 if {$grep_fail == 1} { log_error "Could not grep $config_h for HAVE_LUA" set have_lua false } return $have_lua } ################################################################ # # NAME # get_reservations - returns a dictionary of dictionaries of reservation parameters # # SYNOPSIS # get_reservations ?resv_name? # # RETURN VALUE # Uses `scontrol show reservation` to return a dictionary of dictionaries # of job parameters. Specifying an invalid resv_name result in a failure. # ################################################################ proc get_reservations { {resv_name ""} } { global scontrol set command "$scontrol show reservation -o" if {$resv_name ne ""} { append command " $resv_name" } set output [run_command_output -fail "$command"] # Iterate over each reservation's parameter list foreach line [split $output "\n"] { if {$line eq ""} { continue } # Peel off the resv parameters one at a time # The first quantifier sets the greediness for the whole RE while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { # Remove the consumed parameter from the line set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] # Add it to the temporary job dictionary dict set resv_dict $param_name $param_value } set resv_name_dict [dict get $resv_dict "ReservationName"] # Add the resv dictionary to resvs dictionary dict set resvs_dict $resv_name_dict $resv_dict # Clear the resv dictionary for the next resv set resv_dict {} } return $resvs_dict } ################################################################ # # NAME # get_resvation_param - returns a specific parameter value for a specific reservation # # SYNOPSIS # get_resvation_param resv_name parameter_name # # DESCRIPTION # Returns a specific parameter value for a specified resvation if the # parameter exists for the resvation, or MISSING if it does not exist. # Specifying an invalid resvation name will result in a failure. # ################################################################ proc get_resvation_param { resv_name parameter_name } { set resvs_dict [get_reservations $resv_name] if [dict exists $resvs_dict $resv_name $parameter_name] { return [dict get $resvs_dict $resv_name $parameter_name] } else { return "MISSING" } } ################################################################ # # NAME # create_res - create new reservation in system # # SYNOPSIS # create_res ?res_name? ?res_params? # # RETURN VALUE # the exit code of the scontrol command run # ################################################################ proc create_res { res_name res_params } { global scontrol set result [run_command "$scontrol create res ReservationName=$res_name $res_params"] set output [dict get $result output] set ret_code [dict get $result exit_code] if { $ret_code } { log_warn "[lindex [info level 0] 0]: error from scontrol: $output" } else { log_debug "[lindex [info level 0] 0]: success from scontrol: $output" } return $ret_code } ################################################################ # # NAME # update_res - update exisiting reservation in system # # SYNOPSIS # update_res ?res_name? ?res_params? # # RETURN VALUE # the exit code of the scontrol command run # ################################################################ proc update_res { res_name res_params } { global scontrol set result [run_command "$scontrol update ReservationName=$res_name $res_params"] set output [dict get $result output] set ret_code [dict get $result exit_code] if { $ret_code } { log_warn "Return code from scontrol: $ret_code. Output: $output" } return $ret_code } ################################################################ # # NAME # delete_res - delete reservation from system # # SYNOPSIS # delete_res ?res_name? # # RETURN VALUE # the exit code of the scontrol command run # ################################################################ proc delete_res { res_name } { global scontrol set result [run_command "$scontrol delete ReservationName=$res_name"] set output [dict get $result output] set ret_code [dict get $result exit_code] if { $ret_code } { log_warn "Return code from scontrol: $ret_code. Output: $output" } return $ret_code } ################################################################ # # NAME # create_part - creates a partition # # SYNOPSIS # create_part partition num_nodes # # ARGUMENTS # partition # Name of partition to create # num_nodes # Number of nodes of partition to create # # RETURN VALUE # RETURN_SUCCESS, or non-zero on error # ################################################################ proc create_part { part_name num_nodes_in } { global scontrol srun bin_printenv number re_word_str set nodes "" set num_nodes_out 0 set found 0 spawn $scontrol show partitionname=$part_name expect { -re "PartitionName=$part_name" { set found 1 exp_continue } timeout { log_err "scontrol is not responding" set exit_code 1 } eof { wait } } if {$found == 1} { log_error "There is already a partition $part_name" return $::RETURN_ERROR } if {[string length [default_partition]] == 0} { log_warn "create_part does not work without a default partition" return $::RETURN_ERROR } if { $num_nodes_in } { set num_nodes $num_nodes_in } else { set num_nodes [available_nodes idle] } log_user 0 # Get a list of nodes spawn $srun -t1 -N1-$num_nodes $bin_printenv expect { -re "SLURM_JOB_NUM_NODES=($number)" { set num_nodes_out $expect_out(1,string) exp_continue } -re "SLURM_NODELIST=($re_word_str)" { set nodes $expect_out(1,string) exp_continue } timeout { log_error "srun is not responding getting number of nodes creating part" return $::RETURN_TIMEOUT } eof { wait } } if {[string length $nodes] == 0} { log_error "Did not get a valid node list" return $::RETURN_ERROR } elseif { $num_nodes_out != $num_nodes_in } { log_error "Did not get enough nodes ($num_nodes_out != $num_nodes_in) to run test" return $::RETURN_ERROR } spawn $scontrol create partitionname=$part_name nodes=$nodes expect { timeout { log_error "scontrol is not responding creating partition" return $::RETURN_ERROR } eof { wait } } set found 0 spawn $scontrol show partitionname=$part_name expect { -re "PartitionName=$part_name" { set found 1 exp_continue } timeout { log_error "scontrol is not responding" set exit_code 1 } eof { wait } } if { $found == 0 } { log_error "scontrol did not create partition $part_name" return $::RETURN_ERROR } log_user 1 log_debug "Created partition $part_name with $num_nodes_in nodes" return $::RETURN_SUCCESS } ################################################################ # # NAME # get_nodes - returns a dictionary of dictionaries of node parameters # # SYNOPSIS # get_nodes ?hostlist_expression? # # DESCRIPTION # Uses `scontrol show node` to query node parameters, returning a # dictionary of dictionaries with the node names as keys of the first # level dictionary and with the parameters as keys of the second level # dictionary. Specifying an invalid node name will result in a failure. # # RETURN VALUE # If the optional node expression argument is specified, the result will # be constrained by the specified hostlist expression. Otherwise, the # results for all nodes will be returned. # ################################################################ proc get_nodes { {hostlist_expression ""} } { global scontrol set command "$scontrol show node -o" if {$hostlist_expression ne ""} { append command " $hostlist_expression" } set output [run_command_output -fail -nolog "$command"] # Iterate over each node parameter line foreach line [split $output "\n"] { if {$line eq ""} { continue } # Peel off the node parameters one at a time # The first quantifier sets the greediness for the whole RE while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { # Remove the consumed parameter from the line set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] # Add it to the temporary node dictionary dict set node_dict $param_name $param_value } set node_name [dict get $node_dict "NodeName"] # Add the node dictionary to nodes dictionary dict set nodes_dict $node_name $node_dict # Clear the node dictionary for use by the next node set node_dict {} } return $nodes_dict } ################################################################ # # NAME # get_node_param - returns a specific parameter value for a specific node # # SYNOPSIS # get_node_param node_name parameter_name # # DESCRIPTION # Returns a specific parameter value for a specified node if the # parameter exists for the node, or MISSING if it does not exist. # Specifying an invalid node name will result in a failure. # ################################################################ proc get_node_param { node_name parameter_name } { set nodes_dict [get_nodes $node_name] if [dict exists $nodes_dict $node_name $parameter_name] { return [dict get $nodes_dict $node_name $parameter_name] } else { return "MISSING" } } ################################################################ # # NAME # get_nodes_by_request - get a list of nodes satisfying requested resources # # SYNOPSIS # get_nodes_by_request ?options? ?request_args? # # DESCRIPTION # Using srun (optionally with the specified arguments), returns a list # of nodes having the requested resources. # If an error occurs, the invoking test will fail. # # OPTIONS # -fail # fail the test if the execution of srun results in an error or timeout # # ARGUMENTS # request_args # Desired resources of a node in form of srun arguments, # e.g. "--gres=gpu:1 -n1 -t1" # # RETURN VALUE # A list of nodes with at least the requested resources, or an empty # list otherwise. # ################################################################ proc get_nodes_by_request args { global srun set options [list] while {[llength $args]} { switch -glob -- [lindex $args 0] { -fail { lappend options [lrange $args 0 0] set args [lrange $args 1 end] } default break } } if {[llength $args] == 1} { lassign $args request_args } elseif {[llength $args] == 0} { set request_args "-n1 -t1" } else { fail "[lindex [info level 0] 0]: Invalid number of arguments ([llength $args]): $args" } set command "$srun -Q $request_args printenv SLURMD_NODENAME" set result [run_command {*}$options $command] if [dict get $result exit_code] { return {} } set output [dict get $result output] foreach line [split $output "\n"] { if {$line eq ""} { continue } dict incr allocated_nodes $line } return [lsort [dict keys $allocated_nodes]] } ################################################################ # # NAME # get_jobs - returns a dictionary of dictionaries of job parameters # # SYNOPSIS # get_jobs ?job_id? # # DESCRIPTION # Uses `scontrol show job` to return a dictionary of dictionaries of job # parameters. Specifying an invalid job id will result in a failure. # ################################################################ proc get_jobs { {job_id_in ""} } { global scontrol set command "$scontrol show job -d -o" if {$job_id_in ne ""} { append command " $job_id_in" } set output [run_command_output -fail "$command"] # Iterate over each job's parameter list foreach line [split $output "\n"] { if {$line eq ""} { continue } # Peel off the job parameters one at a time # The first quantifier sets the greediness for the whole RE while {[regexp {^ *?([^ =]+)=(.*)(?= +[^ =]+=| *$)} $line {} param_name param_value]} { # Remove the consumed parameter from the line set line [regsub {^ *?[^ =]+=.*(?= +[^ =]+=| *$)} $line {}] # Add it to the temporary job dictionary dict set job_dict $param_name $param_value } set job_id [dict get $job_dict "JobId"] # Add the job dictionary to jobs dictionary dict set jobs_dict $job_id $job_dict # Clear the job dictionary for the next job set job_dict {} } return $jobs_dict } ################################################################ # # NAME # get_job_param - returns a specific parameter value for a specific job # # SYNOPSIS # get_job_param job_id parameter_name # # DESCRIPTION # Returns a specific parameter value for a specified job if the # parameter exists for the job, or MISSING if it does not exist. # Specifying an invalid job id will result in a failure. # ################################################################ proc get_job_param { job_id parameter_name } { set jobs_dict [get_jobs $job_id] if [dict exists $jobs_dict $job_id $parameter_name] { return [dict get $jobs_dict $job_id $parameter_name] } else { return "MISSING" } } proc check_reason { job_id reason } { global squeue set found 0 spawn $squeue -j $job_id --noheader -o "%r" expect { -re "$reason" { set found 1 exp_continue } timeout { log_error "squeue not responding" } eof { wait } } if {$found == 0} { log_error "Job $job_id should have a wait reason of $reason" } return $found } ################################################################ # # NAME # submit_job - submits a job with sbatch and returns its job id # # SYNOPSIS # submit_job ?options? job_args # # DESCRIPTION # Submits a job with sbatch and returns its jobid, or 0 if error. # It accepts all the options of run_command. # # OPTIONS # It accpets and passes all the options of/to run_command. # # ARGUMENTS # job_args # a string containing all the arguments to pass to sbatch # # RETURN VALUE # the job id, or 0 if an error happen # ################################################################ proc submit_job args { global sbatch set job_id 0 if {[llength $args] < 1} { fail "Wrong number of parameters, should be >=1" } set job_options [lindex $args [expr [llength $args] - 1 ]] set run_options "" if {[llength $args] > 1} { set run_options [lrange $args 0 [expr [llength $args] - 2 ]] } set output [run_command_output {*}$run_options "$sbatch $job_options"] regexp {Submitted \S+ job (\d+)} $output - job_id return $job_id } ################################################################################ # # NAME # compile_against_libslurm - compiles a test program against either libslurm.so or libslurmfull.so # # SYNOPSIS # compile_against_libslurm test_prog ?use_full? ?build_args? # # DESCRIPTION # Compile a test program against either libslurm.so or libslurmfull.so. # # ARGUMENTS # test_prog # The name of the test program (and .c file) # use_full # If 1, use libslurmfull.so instead of libslurm.so. Defaults to 0. # build_args # Additional string to be appended to the build command. # E.g. "-DUSING_VALGRIND -lm ${build_dir}/src/slurmctld/locks.o" # (initial space will be added automatically). # ################################################################################ proc compile_against_libslurm { test_prog {use_full 0} {build_args ""} } { global slurm_dir bin_cc src_dir build_dir exit_code bin_chmod if {$use_full} { set libfile "libslurmfull.so" } else { set libfile "libslurm.so" } if [file exists $slurm_dir/lib64/slurm/$libfile] { set libdir "lib64" } else { set libdir "lib" } if {$use_full} { set libline "$slurm_dir/$libdir/slurm" set libfile "slurmfull" } else { set libline "$slurm_dir/$libdir" set libfile "slurm" } set build_cmd "$bin_cc ${test_prog}.c -g -pthread -o $test_prog -I$src_dir -I$build_dir -I$slurm_dir/include -Wl,-rpath=$libline -L$libline -l$libfile -lresolv" # Add additional arguments to the build command if {$build_args != ""} { set build_cmd "$build_cmd $build_args" } log_debug "Build command: $build_cmd" catch {exec {*}$build_cmd} out_str out_dict if {[dict get $out_dict -code]} { log_error $out_str set exit_code 1 return } exec $bin_chmod 700 $test_prog } ################################################################ # # NAME # subtest - tests a boolean condition and updates subtest tallies # # SYNOPSIS # subtest ?options? condition description ?diagnostics? # # DESCRIPTION # Based on the results of testing a boolean expression, increments the # relevant subtest count (pass, fail or skip) and logs a message. # # OPTIONS # -fatal # If the subtest fails, causes a fatal error ending the test # # ARGUMENTS # condition # The boolean expression to test # description # A single-line string describing what is being tested. This is # a subtest "name" that is displayed with the log message # whether the subtest passes or fails # diagnostics # A string providing additional diagnostic information that is # only included in the log message on failure # # ENVIRONMENT # testsuite_subtest_fatal # Specifies whether first failing subtest aborts the test # ################################################################ proc subtest args { set options [list] while {[llength $args]} { switch -glob -- [lindex $args 0] { -fatal - -fail { lappend options -fatal set args [lrange $args 1 end] } -* {fail "Unknown option: [lindex $args 0]"} default break } } set argument_count [llength $args] if {$argument_count < 2} { fail "Too few arguments ($argument_count): $args" } else { set args [lassign $args condition description] } if [uplevel 1 expr [format "{%s}" $condition]] { subpass $description } else { subfail {*}$options $description {*}$args } } ################################################################ # # NAME # _log_format - prints a log message with colorization and formatting # # SYNOPSIS # _log_format log_level message # # DESCRIPTION # This procedure is called by the log_ procedures and # derives the relevant log level from the caller's procedure name. # # ARGUMENTS # log_level # The logging threshold that triggered the log statement # message # The message to print with colorization and formatting # # ENVIRONMENT # testsuite_log_format # Used as the template for the fields to be output. # Fields must be expressed in the form: # %{} # Supported fields include the following: # message # The log message # filename # The file name where the log_ # procedure was called from # lineno # The line number where the log_ # procedure was called from # timestamp # The date and time when the log_ # procedure was called at # msecs # The milliseconds when the log_ # procedure was called at # loglevel # The log level that triggers the log_ # procedure to be called # backtrace # An abbreviated call stack trace with line # numbers # testsuite_time_format # Used as a template for the timestamp. See the format groups # for the tcl clock format command. # testsuite_colorize # Boolean that turns colorization on or off # testsuite_color_ # Can be set to define the color used for each log level # ################################################################ proc _log_format { log_level message } { global testsuite_colorize testsuite_log_format testsuite_time_format global COLOR_NONE global testsuite_color_fatal testsuite_color_error testsuite_color_warn global testsuite_color_info testsuite_color_debug testsuite_color_trace set format_string $testsuite_log_format set milliseconds_since_epoch [clock milliseconds] set date_time [clock format [expr {$milliseconds_since_epoch / 1000}] -format "$testsuite_time_format"] set milliseconds [expr {$milliseconds_since_epoch % 1000}] set frame_level -2 while { [dict get [info frame $frame_level] type] != "source" } { incr frame_level -1 } set format_args {} while {[regexp "%{\[a-z]+}" $format_string format_field]} { if {$format_field eq "%{message}"} { lappend format_args $message } elseif {$format_field eq "%{filename}"} { lappend format_args [file tail [dict get [info frame $frame_level] file]] } elseif {$format_field eq "%{lineno}"} { lappend format_args [dict get [info frame $frame_level] line] } elseif {$format_field eq "%{timestamp}"} { lappend format_args $date_time } elseif {$format_field eq "%{msecs}" || $format_field eq "%{milliseconds}"} { lappend format_args $milliseconds } elseif {$format_field eq "%{loglevel}" || $format_field eq "%{levelname}"} { lappend format_args [string totitle $log_level] } elseif {$format_field eq "%{backtrace}"} { lappend format_args [_line_trace] } else { fail "Invalid field ($format_field) specified in testsuite_log_format" } regsub $format_field $format_string "%" format_string } if ($testsuite_colorize) { switch $log_level { fatal { append output $testsuite_color_fatal } error { append output $testsuite_color_error } warning { append output $testsuite_color_warn } info { append output $testsuite_color_info } debug { append output $testsuite_color_debug } trace { append output $testsuite_color_trace } } } append output [format $format_string {*}$format_args] if ($testsuite_colorize) { append output $COLOR_NONE } puts $output } ################################################################ # # NAME # _print_header - prints a test header # # SYNOPSIS # _print_header # # ENVIRONMENT # testsuite_colorize # Boolean that turns colorization on or off # testsuite_color_header # Can be set to define the color used for the header # ################################################################ proc _print_header { } { global test_name testsuite_color_header testsuite_colorize COLOR_NONE if ($testsuite_colorize) { append output $testsuite_color_header } append output [string repeat = 78]\n append output [format "%-9s" "TEST:"]${test_name}\n append output [string repeat = 78] if ($testsuite_colorize) { append output $COLOR_NONE } puts $output } ################################################################ # # NAME # _print_summary - prints the final status summary # # SYNOPSIS # _print_summary status completed # # ARGUMENTS # status # The final status of the test. # When status is zero, we print SUCCESS. # When status is negative, we print SKIPPED. # When status is positive, we print FAILURE. # # completed # A boolean value that is true if the test completed and false # if aborted (ended early with exit status != 0) # # ENVIRONMENT # testsuite_colorize # Boolean that turns colorization on or off # testsuite_color_ # Can be set to define the color used for each test status # ################################################################ proc _print_summary {status completed} { global test_name testsuite_colorize COLOR_NONE global testsuite_color_success testsuite_color_skipped global testsuite_color_failure global subtest_fail_count subtest_pass_count subtest_skip_count if {$status == 0} { set color $testsuite_color_success set header "SUCCESS" } elseif {$status < 0} { set color $testsuite_color_skipped set header "SKIPPED" } elseif {$status > 0} { set color $testsuite_color_failure; set header "FAILURE" } if ($testsuite_colorize) { append output $color } append output [string repeat = 78]\n append output [format "%s : %s\n" $header $test_name] set subtest_count [expr $subtest_fail_count + $subtest_pass_count + $subtest_skip_count] if {$subtest_count > 0} { append output [format " Subtests failed : %d (%3d%%)\n" $subtest_fail_count [expr $subtest_fail_count * 100 / $subtest_count]] append output [format " Subtests skipped : %d (%3d%%)\n" $subtest_skip_count [expr $subtest_skip_count * 100 / $subtest_count]] append output [format " Subtests passed : %d (%3d%%)\n" $subtest_pass_count [expr $subtest_pass_count * 100 / $subtest_count]] append output [format " Subtests total : %d (%s)\n" $subtest_count [expr {$completed ? "COMPLETED" : "INCOMPLETE"}]] } append output [string repeat = 78] if ($testsuite_colorize) { append output $COLOR_NONE } puts $output } ################################################################ # # NAME # _get_test_name - gets the name of the invoking source script # # SYNOPSIS # _get_test_name # # RETURN VALUE # The name of the originally called script # ################################################################ proc _get_test_name { } { set test_name unknown set frame_level 1 while { $frame_level <= [info frame] } { if { [dict get [info frame $frame_level] type] == "source" } { set test_name [file tail [dict get [info frame $frame_level] file]] break } incr frame_level } return $test_name } ################################################################ # # NAME # _test_cleanup - performs the test cleanup # # SYNOPSIS # _test_cleanup # # DESCRIPTION # This procedure removes the temporary test_dir and calls the # test-defined cleanup procedure. # # NOTES # This function should be called only from _test_init and _exit. # ################################################################ proc _test_cleanup {} { global log_warn test_dir set rc 0 # Call global cleanup procedure if it is defined by the test if {[info procs cleanup] eq "cleanup"} { if {[catch {cleanup} cleanup_error ]} { log_error "Cleanup had errors: $cleanup_error" set rc 1 } } # Remove the temporary test directory exec rm -rf $test_dir return $rc } ################################################################ # # NAME # _test_init - performs test initialization # # SYNOPSIS # _test_init # # DESCRIPTION # This procedure is called automatically at the beginning of each test. # It prints the header, creates the temporary test dir, etc. # ################################################################ proc _test_init {} { global test_dir test_id test_name testsuite_shared_dir # Set test name to name of originally invoked test script, e.g. test1.1 set test_name [_get_test_name] # Set test id to suffix of the test script, e.g. 1.1 set test_id [string map {test ""} $test_name] # Temporary test directory used to stash saved configs, output files... set test_dir "$testsuite_shared_dir/${test_name}dir" # Print test header _print_header # Cleanup in case test was not cleaned up on last execution if { [_test_cleanup] } { fail "Error in the initial cleanup" } # Create temporary shared test directory exec mkdir -p $test_dir } ################################################################ # # NAME # _test_fini - performs test finalization # # SYNOPSIS # _test_fini ?status? # # DESCRIPTION # This procedure is called automatically from the ending functions # pass, skip and failure. # It cleans up based on the status and the testsuite_cleanup_on_failure # variable, prints the final test status/summary, and exits the test. # # We will always cleanup for SUCCESS or SKIPPED tests. # Whether or not the cleanup procedure is called for FAILURE tests # depends on the setting of the $testsuite_cleanup_on_failure variable # which can be set in the globals.local file or overridden with the # SLURM_TESTSUITE_CLEANUP_ON_FAILURE environment variable. # ################################################################ proc _test_fini { status } { global testsuite_cleanup_on_failure global subtest_fail_count subtest_skip_count # Determine if test completed or was aborted set completed [expr $status == 0 ? true : false] # Override status with subtest status if available and necessary if {$status < 1} { if {$subtest_fail_count > 0} { set status 1 } elseif {$subtest_skip_count > 0} { set status -1 } } # Only cleanup if test not failed or configured to do so if {$status != 1 || $testsuite_cleanup_on_failure} { _test_cleanup } _print_summary $status $completed __exit $status } ################################################################ # # Overload the exit routine to ensure that no one is explicitly # calling it, and to enforce _test_fini if exit is called when # the test reach its EOF. # # All tests should exit using pass, skip or fail. # ################################################################ rename exit __exit proc exit { {status 0} } { global test_name # # Minor sanity check to detect if exit was explicitly called (not # allowed) or automatically executed when the test ends # if {[info level] > 1} { # exit was called from a function, and it shouldn't log_error "Exit should not be directly called, use pass, skip or fail instead" } else { set frame_level 1 while { $frame_level <= [info frame] } { if { [dict get [info frame $frame_level] type] == "source" } { if { [file tail [dict get [info frame $frame_level] file]] eq $test_name } { # exit was called explicitly from the # test, and it shouldn't log_error "Exit should not be directly called, use pass, skip or fail instead" } break } incr frame_level } } # The exit was called implicitly when the test ends, allowed but # _test_fini call enforced _test_fini $status } # Call _test_init at the beginning of each test _test_init