#!/bin/bash
#
# Sample Prolog to start and quit the MPS server as needed
# NOTE: This is only a sample and may need modification for your environment

# Specify default location of the file where this script tracks the MPS device ID
MPS_DEV_ID_FILE="/var/run/mps_dev_id"

# Specify directories where MPS and Slurm commands are located (if not in search path)
#MPS_CMD_DIR="/usr/bin/"
#SLURM_CMD_DIR="/usr/bin/"

# Determine which GPU the MPS server is running on, if any
if [ -f ${MPS_DEV_ID_FILE} ]; then
    MPS_DEV_ID=$(cat ${MPS_DEV_ID_FILE})
else
    MPS_DEV_ID=""
fi

# If the job requires MPS, determine whether the server is now running on the
# wrong (old) GPU assignment
unset KILL_MPS_SERVER
if [ -n "${CUDA_VISIBLE_DEVICES}" ] && [ -n "${CUDA_MPS_ACTIVE_THREAD_PERCENTAGE}" ] &&
   [[ ${CUDA_VISIBLE_DEVICES} != ${MPS_DEV_ID} ]]; then
    KILL_MPS_SERVER=1
# If the job requires full GPU(s), kill the MPS server if it is still running
# on any of the GPUs allocated to this job.
# This string compare assumes there are no more than 10 GPUs per node.
elif [ -n "${CUDA_VISIBLE_DEVICES}" ] && [ -z "${CUDA_MPS_ACTIVE_THREAD_PERCENTAGE}" ] &&
     [[ ${CUDA_VISIBLE_DEVICES} == *${MPS_DEV_ID}* ]]; then
    KILL_MPS_SERVER=1
fi

if [ -n "${KILL_MPS_SERVER}" ]; then
    echo -1 >${MPS_DEV_ID_FILE}
    # Determine if the MPS control daemon is running
    ps aux | grep nvidia-cuda-mps-control | grep -v grep > /dev/null
    if [ $? -eq 0 ]; then
        echo "Stopping MPS control daemon"
        # Reset the compute mode of this job's GPU(s) to DEFAULT
        ${MPS_CMD_DIR}nvidia-smi -i ${CUDA_VISIBLE_DEVICES} -c DEFAULT
        # Quit the MPS control daemon (this also stops the MPS server)
        echo quit | ${MPS_CMD_DIR}nvidia-cuda-mps-control
        # Test for the presence of an MPS zombie process
        ps aux | grep nvidia-cuda-mps | grep -v grep > /dev/null
        if [ $? -eq 0 ]; then
            logger "$(hostname) Slurm Prolog: MPS refusing to quit! Downing node"
            ${SLURM_CMD_DIR}scontrol update NodeName=${SLURMD_NODENAME} State=DOWN Reason="MPS not quitting"
        fi
        # Simple GPU sanity check
        ${MPS_CMD_DIR}nvidia-smi > /dev/null
        if [ $? -ne 0 ]; then
            logger "$(hostname) Slurm Prolog: GPU not operational! Downing node"
            ${SLURM_CMD_DIR}scontrol update NodeName=${SLURMD_NODENAME} State=DOWN Reason="GPU not operational"
        fi
    fi
fi

# If the job requires MPS, write the device ID to the tracking file and start the
# server as needed.
# If the server is already running, the start requests below just return an error.
if [ -n "${CUDA_VISIBLE_DEVICES}" ] && [ -n "${CUDA_MPS_ACTIVE_THREAD_PERCENTAGE}" ]; then
    echo ${CUDA_VISIBLE_DEVICES} >${MPS_DEV_ID_FILE}
    # Unset so this job's percentage does not become the server-wide default
    unset CUDA_MPS_ACTIVE_THREAD_PERCENTAGE
    export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps_${CUDA_VISIBLE_DEVICES}
    export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log_${CUDA_VISIBLE_DEVICES}
    ${MPS_CMD_DIR}nvidia-cuda-mps-control -d && echo "MPS control daemon started"
    sleep 1
    echo "start_server -uid ${SLURM_JOB_UID}" | ${MPS_CMD_DIR}nvidia-cuda-mps-control && echo "MPS server started for ${SLURM_JOB_UID}"
fi

exit 0
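
# ---------------------------------------------------------------------------
# Illustrative usage note (an assumption, not part of the sample above): for
# slurmd to run this script at job start, it would typically be referenced via
# the Prolog parameter in slurm.conf on the compute nodes, for example:
#
#   Prolog=/etc/slurm/prolog_mps.sh    # path and file name are hypothetical
#
# Adjust the path, permissions, and ownership for your installation.
# ---------------------------------------------------------------------------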