/*****************************************************************************\
 *  task_affinity.c - Library for task pre-launch and post-termination
 *  functions for task affinity support
*****************************************************************************
* Copyright (C) 2005-2008 Hewlett-Packard Development Company, L.P.
* Modified by Hewlett-Packard for task affinity support using task_none.c
* Copyright (C) 2005-2007 The Regents of the University of California
* Copyright (C) 2008-2009 Lawrence Livermore National Security.
* Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
 * For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/
#define _GNU_SOURCE
#include "config.h"
#include <dirent.h>
#include <errno.h>
#include <limits.h>
#include <sched.h>
#include <sys/types.h>
#include <unistd.h>
#include "affinity.h"
#include "dist_tasks.h"
#include "src/slurmd/common/task_plugin.h"
/* Enable purging of the cpuset directories once each
 * task, and then the whole step, has completed.
 */
#define PURGE_CPUSET_DIRS 1
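/*
 * Illustrative layout (an assumption for this sketch: CPUSET_DIR, defined
 * in affinity.h, resolves to a cpuset mount point such as /dev/cpuset):
 *
 *	/dev/cpuset/slurm<jobid>                         job  (pre_setuid)
 *	/dev/cpuset/slurm<jobid>/slurm<jobid>.<stepid>_<taskid>
 *	                                                 task (pre_launch)
 *
 * task_p_post_term() and task_p_slurmd_release_resources() below remove
 * these directories in reverse order of creation.
 */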
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
 *	<application>/<method>
 *
 * where <application> is a description of the intended application of
 * the plugin (e.g., "task" for task control) and <method> is a description
* of how this plugin satisfies that application. Slurm will only load
* a task plugin if the plugin_type string has a prefix of "task/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
const char plugin_name[] = "task affinity plugin";
const char plugin_type[] = "task/affinity";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
* init() is called when the plugin is loaded, before any other functions
* are called. Put global initialization here.
*/
extern int init (void)
{
cpu_set_t cur_mask;
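	/* Hex string of the CPU mask: one hex digit per 4 CPUs, plus NUL */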
char mstr[1 + CPU_SETSIZE / 4];
slurm_getaffinity(0, sizeof(cur_mask), &cur_mask);
task_cpuset_to_str(&cur_mask, mstr);
verbose("%s loaded with CPU mask 0x%s", plugin_name, mstr);
return SLURM_SUCCESS;
}
/*
* fini() is called when the plugin is removed. Clear any allocated
* storage here.
*/
extern int fini (void)
{
debug("%s unloaded", plugin_name);
return SLURM_SUCCESS;
}
/* CPU bind enforcement: update the binding type based upon the
 * TaskPluginParam configuration parameter. */
static void _update_bind_type(launch_tasks_request_msg_t *req)
{
bool set_bind = false;
if ((req->cpu_bind_type & (~CPU_BIND_VERBOSE)) == 0) {
if (conf->task_plugin_param & CPU_BIND_NONE) {
req->cpu_bind_type |= CPU_BIND_NONE;
req->cpu_bind_type &= (~CPU_BIND_TO_SOCKETS);
req->cpu_bind_type &= (~CPU_BIND_TO_CORES);
req->cpu_bind_type &= (~CPU_BIND_TO_THREADS);
req->cpu_bind_type &= (~CPU_BIND_TO_LDOMS);
set_bind = true;
} else if (conf->task_plugin_param & CPU_BIND_TO_SOCKETS) {
req->cpu_bind_type &= (~CPU_BIND_NONE);
req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
req->cpu_bind_type &= (~CPU_BIND_TO_CORES);
req->cpu_bind_type &= (~CPU_BIND_TO_THREADS);
req->cpu_bind_type &= (~CPU_BIND_TO_LDOMS);
set_bind = true;
} else if (conf->task_plugin_param & CPU_BIND_TO_CORES) {
req->cpu_bind_type &= (~CPU_BIND_NONE);
req->cpu_bind_type &= (~CPU_BIND_TO_SOCKETS);
req->cpu_bind_type |= CPU_BIND_TO_CORES;
req->cpu_bind_type &= (~CPU_BIND_TO_THREADS);
req->cpu_bind_type &= (~CPU_BIND_TO_LDOMS);
set_bind = true;
} else if (conf->task_plugin_param & CPU_BIND_TO_THREADS) {
req->cpu_bind_type &= (~CPU_BIND_NONE);
req->cpu_bind_type &= (~CPU_BIND_TO_SOCKETS);
req->cpu_bind_type &= (~CPU_BIND_TO_CORES);
req->cpu_bind_type |= CPU_BIND_TO_THREADS;
req->cpu_bind_type &= (~CPU_BIND_TO_LDOMS);
set_bind = true;
} else if (conf->task_plugin_param & CPU_BIND_TO_LDOMS) {
req->cpu_bind_type &= (~CPU_BIND_NONE);
req->cpu_bind_type &= (~CPU_BIND_TO_SOCKETS);
req->cpu_bind_type &= (~CPU_BIND_TO_CORES);
req->cpu_bind_type &= (~CPU_BIND_TO_THREADS);
			req->cpu_bind_type |= CPU_BIND_TO_LDOMS;
set_bind = true;
}
}
if (conf->task_plugin_param & CPU_BIND_VERBOSE) {
req->cpu_bind_type |= CPU_BIND_VERBOSE;
set_bind = true;
}
if (set_bind) {
char bind_str[128];
slurm_sprint_cpu_bind_type(bind_str, req->cpu_bind_type);
info("task affinity : enforcing '%s' cpu bind method",
bind_str);
}
}
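/*
 * Illustrative sketch: with "TaskPluginParam=Cores" in slurm.conf and a
 * launch request that carries no explicit binding, _update_bind_type()
 * forces core binding and clears every competing flag. Using the
 * CPU_BIND_* flags from slurm.h, the effect is:
 *
 *	cpu_bind_type_t t = 0;
 *	t |= CPU_BIND_TO_CORES;
 *	t &= ~(CPU_BIND_NONE | CPU_BIND_TO_SOCKETS |
 *	       CPU_BIND_TO_THREADS | CPU_BIND_TO_LDOMS);
 *
 * leaving t == CPU_BIND_TO_CORES (plus CPU_BIND_VERBOSE if configured).
 */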
/*
* task_p_slurmd_batch_request()
*/
extern int task_p_slurmd_batch_request (batch_job_launch_msg_t *req)
{
info("task_p_slurmd_batch_request: %u", req->job_id);
batch_bind(req);
return SLURM_SUCCESS;
}
/*
* task_p_slurmd_launch_request()
*/
extern int task_p_slurmd_launch_request (launch_tasks_request_msg_t *req,
uint32_t node_id)
{
char buf_type[100];
if (((conf->sockets >= 1)
&& ((conf->cores > 1) || (conf->threads > 1)))
|| (!(req->cpu_bind_type & CPU_BIND_NONE))) {
_update_bind_type(req);
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
debug("task affinity : before lllp distribution cpu bind "
"method is '%s' (%s)", buf_type, req->cpu_bind);
lllp_distribution(req, node_id);
slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
debug("task affinity : after lllp distribution cpu bind "
"method is '%s' (%s)", buf_type, req->cpu_bind);
}
return SLURM_SUCCESS;
}
/*
* task_p_slurmd_reserve_resources()
*/
extern int task_p_slurmd_reserve_resources (launch_tasks_request_msg_t *req,
uint32_t node_id)
{
debug("task_p_slurmd_reserve_resources: %u", req->job_id);
return SLURM_SUCCESS;
}
/*
* task_p_slurmd_suspend_job()
*/
extern int task_p_slurmd_suspend_job (uint32_t job_id)
{
debug("task_p_slurmd_suspend_job: %u", job_id);
return SLURM_SUCCESS;
}
/*
* task_p_slurmd_resume_job()
*/
extern int task_p_slurmd_resume_job (uint32_t job_id)
{
debug("task_p_slurmd_resume_job: %u", job_id);
return SLURM_SUCCESS;
}
/*
* task_p_slurmd_release_resources()
*/
extern int task_p_slurmd_release_resources (uint32_t job_id)
{
DIR *dirp;
struct dirent *entryp;
char base[PATH_MAX];
char path[PATH_MAX];
debug("%s: affinity jobid %u", __func__, job_id);
#if PURGE_CPUSET_DIRS
/* NOTE: The notify_on_release flag set in cpuset.c
* should remove the directory, but that is not
* happening reliably. */
if (! (conf->task_plugin_param & CPU_BIND_CPUSETS))
return SLURM_SUCCESS;
#ifdef MULTIPLE_SLURMD
if (snprintf(base, PATH_MAX, "%s/slurm_%s_%u",
CPUSET_DIR,
(conf->node_name != NULL)?conf->node_name:"",
job_id) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
#else
if (snprintf(base, PATH_MAX, "%s/slurm%u",
CPUSET_DIR, job_id) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
#endif
if (rmdir(base) == 0)
return SLURM_SUCCESS;
	/* rmdir(2) of a cpuset that still has child cpusets fails
	 * with EBUSY (or ENOTEMPTY on some kernels). */
if (errno != ENOTEMPTY
&& errno != EBUSY) {
error("%s: rmdir(%s) failed %m", __func__, base);
return SLURM_ERROR;
}
	/* Directory not empty: remove the child cpusets first. */
if ((dirp = opendir(base)) == NULL) {
error("%s: could not open dir %s: %m", __func__, base);
return SLURM_ERROR;
}
while (1) {
if (!(entryp = readdir(dirp)))
break;
if (xstrncmp(entryp->d_name, "slurm", 5))
continue;
if (snprintf(path, PATH_MAX, "%s/%s",
base, entryp->d_name) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
break;
}
if (rmdir(path) != 0) {
error("%s: rmdir(%s) failed %m", __func__, base);
closedir(dirp);
return SLURM_ERROR;
}
}
closedir(dirp);
if (rmdir(base) != 0) {
error("%s: rmdir(%s) failed %m", __func__, base);
return SLURM_ERROR;
}
#endif
return SLURM_SUCCESS;
}
/*
 * task_p_pre_setuid() is called before setting the UID for the
 * user to launch their jobs. Use this to create the CPUSET directory
 * and set the owner appropriately.
*/
extern int task_p_pre_setuid (stepd_step_rec_t *job)
{
char path[PATH_MAX];
int rc = SLURM_SUCCESS;
if (conf->task_plugin_param & CPU_BIND_CPUSETS) {
#ifdef MULTIPLE_SLURMD
if (snprintf(path, PATH_MAX, "%s/slurm_%s_%u",
CPUSET_DIR,
(conf->node_name != NULL)?conf->node_name:"",
			     job->jobid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
rc = SLURM_ERROR;
}
#else
if (snprintf(path, PATH_MAX, "%s/slurm%u",
			     CPUSET_DIR, job->jobid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
rc = SLURM_ERROR;
}
#endif
if (rc == SLURM_SUCCESS) {
rc = slurm_build_cpuset(CPUSET_DIR, path, job->uid,
job->gid);
if (rc != SLURM_SUCCESS) {
error("%s: slurm_build_cpuset() failed",
__func__);
}
}
}
if (rc == SLURM_SUCCESS)
cpu_freq_cpuset_validate(job);
return rc;
}
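/*
 * Illustrative sketch (the path, uid, and gid are made up for the example;
 * CPUSET_DIR comes from affinity.h): for job 1234 run by uid 500/gid 500,
 * the call above behaves like
 *
 *	slurm_build_cpuset("/dev/cpuset", "/dev/cpuset/slurm1234", 500, 500);
 *
 * so the job's tasks can later create their per-task cpusets under a
 * directory they own, even after privileges are dropped.
 */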
#ifdef HAVE_NUMA
static void _numa_set_preferred(nodemask_t *new_mask)
{
int i;
for (i = 0; i < NUMA_NUM_NODES; i++) {
if (nodemask_isset(new_mask, i)) {
numa_set_preferred(i);
break;
}
}
}
#endif
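/*
 * Example: if new_mask has nodes {1, 3} set, _numa_set_preferred() calls
 * numa_set_preferred(1) and stops; the kernel then prefers, but does not
 * require, allocations from node 1, per numa(3).
 */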
/*
 * task_p_pre_launch() is called prior to exec of the application task.
 * It is followed by the TaskProlog program (from slurm.conf) and
 * --task-prolog (from the srun command line).
*/
extern int task_p_pre_launch (stepd_step_rec_t *job)
{
char base[PATH_MAX], path[PATH_MAX];
int rc = SLURM_SUCCESS;
debug("%s: affinity jobid %u.%u, task:%u bind:%u",
__func__, job->jobid, job->stepid,
job->envtp->procid, job->cpu_bind_type);
if (conf->task_plugin_param & CPU_BIND_CPUSETS) {
info("%s: Using cpuset affinity for tasks", __func__);
#ifdef MULTIPLE_SLURMD
if (snprintf(base, PATH_MAX, "%s/slurm_%s_%u",
CPUSET_DIR,
(conf->node_name != NULL)?conf->node_name:"",
job->jobid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
#else
if (snprintf(base, PATH_MAX, "%s/slurm%u",
CPUSET_DIR, job->jobid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
#endif
if (snprintf(path, PATH_MAX, "%s/slurm%u.%u_%d",
base, job->jobid, job->stepid,
job->envtp->localid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
} else
info("%s: Using sched_affinity for tasks", __func__);
/*** CPU binding support ***/
if (job->cpu_bind_type) {
cpu_set_t new_mask, cur_mask;
pid_t mypid = job->envtp->task_pid;
slurm_getaffinity(mypid, sizeof(cur_mask), &cur_mask);
if (get_cpuset(&new_mask, job) &&
(!(job->cpu_bind_type & CPU_BIND_NONE))) {
reset_cpuset(&new_mask, &cur_mask);
if (conf->task_plugin_param & CPU_BIND_CPUSETS) {
rc = slurm_set_cpuset(base, path, mypid,
sizeof(new_mask),
&new_mask);
slurm_get_cpuset(path, mypid,
sizeof(cur_mask),
&cur_mask);
} else {
rc = slurm_setaffinity(mypid,
sizeof(new_mask),
&new_mask);
slurm_getaffinity(mypid,
sizeof(cur_mask),
&cur_mask);
}
}
task_slurm_chkaffinity(rc ? &cur_mask : &new_mask,
job, rc);
} else if (job->mem_bind_type &&
(conf->task_plugin_param & CPU_BIND_CPUSETS)) {
cpu_set_t cur_mask;
pid_t mypid = job->envtp->task_pid;
/* Establish cpuset just for the memory binding */
slurm_getaffinity(mypid, sizeof(cur_mask), &cur_mask);
		rc = slurm_set_cpuset(base, path, mypid,
				      sizeof(cur_mask), &cur_mask);
}
#ifdef HAVE_NUMA
if ((conf->task_plugin_param & CPU_BIND_CPUSETS) &&
(slurm_memset_available() >= 0)) {
nodemask_t new_mask, cur_mask;
cur_mask = numa_get_membind();
if (get_memset(&new_mask, job) &&
(!(job->mem_bind_type & MEM_BIND_NONE))) {
slurm_set_memset(path, &new_mask);
if (numa_available() >= 0) {
if (job->mem_bind_type & MEM_BIND_PREFER)
_numa_set_preferred(&new_mask);
else
numa_set_membind(&new_mask);
}
cur_mask = new_mask;
}
slurm_chk_memset(&cur_mask, job);
} else if (job->mem_bind_type && (numa_available() >= 0)) {
nodemask_t new_mask, cur_mask;
cur_mask = numa_get_membind();
if (get_memset(&new_mask, job)
&& (!(job->mem_bind_type & MEM_BIND_NONE))) {
if (job->mem_bind_type & MEM_BIND_PREFER)
_numa_set_preferred(&new_mask);
else
numa_set_membind(&new_mask);
cur_mask = new_mask;
}
slurm_chk_memset(&cur_mask, job);
}
#endif
return rc;
}
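/*
 * Naming sketch for the cpuset paths built in task_p_pre_launch() above
 * (the mount point is an assumption; CPUSET_DIR comes from affinity.h).
 * For job 1234, step 0, local task 2, without MULTIPLE_SLURMD:
 *
 *	base = "/dev/cpuset/slurm1234"
 *	path = "/dev/cpuset/slurm1234/slurm1234.0_2"
 *
 * task_p_post_term() below removes the per-task path using the same scheme.
 */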
/*
 * task_p_pre_launch_priv() is called prior to exec of the application
 * task, in privileged mode, just after slurm_spank_task_init_privileged().
*/
extern int task_p_pre_launch_priv(stepd_step_rec_t *job, pid_t pid)
{
return SLURM_SUCCESS;
}
/*
 * task_p_post_term() is called after termination of the application task.
 * It is preceded by --task-epilog (from the srun command line) followed
 * by the TaskEpilog program (from slurm.conf).
*/
extern int task_p_post_term (stepd_step_rec_t *job, stepd_step_task_info_t *task)
{
char base[PATH_MAX], path[PATH_MAX];
debug("%s: affinity %u.%u, task %d",
__func__, job->jobid, job->stepid, task->id);
#if PURGE_CPUSET_DIRS
/* NOTE: The notify_on_release flag set in cpuset.c
* should remove the directory, but that is not
* happening reliably. */
if (! (conf->task_plugin_param & CPU_BIND_CPUSETS))
return SLURM_SUCCESS;
#ifdef MULTIPLE_SLURMD
if (snprintf(base, PATH_MAX, "%s/slurm_%s_%u",
CPUSET_DIR,
(conf->node_name != NULL)?conf->node_name:"",
job->jobid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
#else
if (snprintf(base, PATH_MAX, "%s/slurm%u",
CPUSET_DIR, job->jobid) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
#endif
if (snprintf(path, PATH_MAX, "%s/slurm%u.%u_%d",
base, job->jobid, job->stepid,
task->id) >= PATH_MAX) {
error("%s: cpuset path too long", __func__);
return SLURM_ERROR;
}
/* Only error out if it failed to remove the cpuset dir. The cpuset
* dir may have already been removed by the release_agent. */
if (rmdir(path) != 0 && errno != ENOENT) {
error("%s: rmdir(%s) failed %m", __func__, path);
return SLURM_ERROR;
}
#endif
return SLURM_SUCCESS;
}
/*
 * task_p_post_step() is called after termination of the step
 * (all the tasks).
*/
extern int task_p_post_step (stepd_step_rec_t *job)
{
return SLURM_SUCCESS;
}
/*
 * Keep track of a pid.
*/
extern int task_p_add_pid (pid_t pid)
{
return SLURM_SUCCESS;
}