#!/usr/bin/env expect
############################################################################
# Purpose: Test of Slurm functionality
#          Validate proper GRES operation under heavy load (many jobs)
#
# Output:  "TEST: #.#" followed by "SUCCESS" if test was successful, OR
#          "FAILURE: ..." otherwise with an explanation of the failure, OR
#          anything else indicates a failure mode that must be investigated.
############################################################################
# Copyright (C) 2018 SchedMD LLC
# Written by Morris Jette
#
# This file is part of Slurm, a resource management program.
# For details, see <https://slurm.schedmd.com/>.
# Please also read the included file: DISCLAIMER.
#
# Slurm is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along
# with Slurm; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
############################################################################
source ./globals

set test_id        "39.13"
set exit_code      0
set file_in1       "test$test_id.input1"
set file_in2       "test$test_id.input2"
set file_out       "test$test_id.output"
set number_commas  "\[0-9_,\]+"

print_header $test_id

if {[test_cons_tres]} {
	send_user "\nValid configuration, using select/cons_tres\n"
} else {
	send_user "\nWARNING: This test is only compatible with select/cons_tres\n"
	exit 0
}
if {[test_front_end]} {
	send_user "\nWARNING: This test is incompatible with front-end systems\n"
	exit $exit_code
}

set def_part_name [default_partition]
set nb_nodes [get_node_cnt_in_part $def_part_name]
if {$nb_nodes > 1} {
	set nb_nodes 2
}
set gpu_cnt [get_gpu_count $nb_nodes]
if {$gpu_cnt < 0} {
	send_user "\nFAILURE: Error getting GPU count\n"
	exit 1
}
if {$gpu_cnt < 1} {
	send_user "\nWARNING: This test requires 1 or more GPUs in the default partition\n"
	exit 0
}
get_node_config
send_user "\nGPU count is $gpu_cnt\n"
send_user "Sockets per node is $sockets_per_node\n"
send_user "CPUs per socket is $cpus_per_socket\n"

set tot_cpus [expr $sockets_per_node * $cpus_per_socket]
set tot_gpus $gpu_cnt
if {$nb_nodes > 1} {
	incr tot_gpus $gpu_cnt
}
if {$tot_gpus > 32} {
	set tot_gpus 32
}

#
# Build input script files
#
make_bash_script $file_in1 "$srun -l -N \$SLURM_NNODES -n \$SLURM_NNODES $file_in2
$scontrol -dd show job \$SLURM_JOB_ID | grep gpu
sleep 5
exit 0"

make_bash_script $file_in2 "echo HOST:\$SLURMD_NODENAME CUDA_VISIBLE_DEVICES:\$CUDA_VISIBLE_DEVICES"

#
# Submit jobs with various --gpus counts, up to 2 full nodes or 32 GPUs
#
# spawn $sbatch --cpus-per-gpu=1 --gpus=$inx -t1 -w $hostname -J "test$test_id" --output=$ofile ./$file_in1
for {set inx 1} {$inx <= $tot_gpus} {incr inx} {
	set job_id 0
	set ofile ${file_out}.${inx}
	exec $bin_rm -f $ofile
	if {$tot_gpus > $tot_cpus} {
		# Assumes no configured DefCPUsPerGPU
		spawn $sbatch --gpus=$inx -t1 -w $hostname -J "test$test_id" --output=$ofile ./$file_in1
	} else {
		spawn $sbatch --cpus-per-gpu=1 --gpus=$inx -t1 -w $hostname -J "test$test_id" --output=$ofile ./$file_in1
	}
	expect {
		-re "Submitted batch job ($number)" {
			set job_id $expect_out(1,string)
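			# Job ID comes from the first regexp capture group above;
			# keep reading with exp_continue until sbatch closes its
			# output so no trailing messages are lost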
			exp_continue
		}
		timeout {
			send_user "\nFAILURE: sbatch not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}
	if {$job_id == 0} {
		send_user "\nFAILURE: sbatch job submit failure\n"
		set exit_code 1
		set tot_gpus [expr $inx - 1]
		break
	}
}

#
# Check the output file contents to identify allocated GPUs
#
# Reset max_file_delay to avoid possibly huge delays here when tot_gpus is large
# Perform large delay at the start and small delays for each job's output
#
set save_max_file_delay $max_file_delay
if {$tot_gpus > 4 && $max_file_delay > 30} {
	sleep [expr $max_file_delay - 20]
	set max_file_delay 20
}

for {set inx 1} {$inx <= $tot_gpus} {incr inx} {
	set ofile ${file_out}.${inx}
	if {[wait_for_file $ofile] != 0} {
		send_user "\nFAILURE: output file $ofile not found\n"
		set exit_code 1
	} else {
		set match 0
		spawn $bin_cat $ofile
		expect {
			-re "CUDA_VISIBLE_DEVICES:($number_commas)" {
				incr match [cuda_count $expect_out(1,string)]
				exp_continue
			}
			eof {
				wait
			}
		}
		if {$match != $inx} {
			send_user "\nFAILURE: Bad GPU count in file $ofile ($match != $inx)\n"
			set exit_code 1
		} else {
			exec $bin_rm -f $ofile
		}
	}
}
set max_file_delay $save_max_file_delay

#
# Log any vestigial jobs and cancel them
#
if {$exit_code != 0} {
	spawn $squeue -tall -n "test$test_id"
	expect {
		timeout {
			send_user "\nFAILURE: squeue not responding\n"
			set exit_code 1
		}
		eof {
			wait
		}
	}
}
exec $scancel -n "test$test_id"

if {$exit_code == 0} {
	exec $bin_rm -f $file_in1 $file_in2
	send_user "\nSUCCESS\n"
}
exit $exit_code