/*****************************************************************************\
 * jobacct_gather_cgroup_memory.c - memory cgroup subsystem for
 * jobacct_gather/cgroup
 *****************************************************************************
 * Copyright (C) 2011 Bull
 * Written by Martin Perry (martin.perry@bull.com) based on code from
 * Matthieu Hautreux
 *
 * This file is part of Slurm, a resource management program.
 * For details, see <https://slurm.schedmd.com/>.
 * Please also read the included file: DISCLAIMER.
 *
 * Slurm is free software; you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * In addition, as a special exception, the copyright holders give permission
 * to link the code of portions of this program with the OpenSSL library under
 * certain conditions as described in each individual source file, and
 * distribute linked combinations including the two. You must obey the GNU
 * General Public License in all respects for all of the code used other than
 * OpenSSL. If you modify file(s) with this exception, you may extend this
 * exception to your version of the file(s), but you are not obligated to do
 * so. If you do not wish to do so, delete this exception statement from your
 * version. If you delete this exception statement from all source files in
 * the program, then also delete it here.
 *
 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License along
 * with Slurm; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

/*
 * The system header names were lost in extraction; these are the headers the
 * code below needs (PATH_MAX, getenv, pid_t, getpid).
 */
#include <limits.h>		/* PATH_MAX */
#include <stdlib.h>		/* getenv */
#include <sys/types.h>		/* pid_t */
#include <unistd.h>		/* getpid */

#include "slurm/slurm_errno.h"
#include "slurm/slurm.h"
#include "src/common/xstring.h"
#include "src/plugins/jobacct_gather/cgroup/jobacct_gather_cgroup.h"
#include "src/slurmd/slurmstepd/slurmstepd_job.h"
#include "src/slurmd/slurmd/slurmd.h"

static char user_cgroup_path[PATH_MAX];
static char job_cgroup_path[PATH_MAX];
static char jobstep_cgroup_path[PATH_MAX];
static char task_cgroup_path[PATH_MAX];

static xcgroup_ns_t memory_ns;
static xcgroup_t user_memory_cg;
static xcgroup_t job_memory_cg;
static xcgroup_t step_memory_cg;

List task_memory_cg_list = NULL;

static uint32_t max_task_id;

extern int jobacct_gather_cgroup_memory_init(void)
{
	/* initialize user/job/jobstep cgroup relative paths */
	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';
	task_cgroup_path[0] = '\0';

	/* initialize memory cgroup namespace */
	if (xcgroup_ns_create(&memory_ns, "", "memory") != XCGROUP_SUCCESS) {
		error("jobacct_gather/cgroup: unable to create memory "
		      "namespace");
		return SLURM_ERROR;
	}

	FREE_NULL_LIST(task_memory_cg_list);
	task_memory_cg_list = list_create(free_task_cg_info);

	return SLURM_SUCCESS;
}

extern int jobacct_gather_cgroup_memory_fini(void)
{
	xcgroup_t memory_cg;
	bool lock_ok;
	int cc;

	if (user_cgroup_path[0] == '\0' ||
	    job_cgroup_path[0] == '\0' ||
	    jobstep_cgroup_path[0] == '\0' ||
	    task_cgroup_path[0] == '\0')
		return SLURM_SUCCESS;

	/* Move the slurmstepd back to the root memory cg */
	if (xcgroup_create(&memory_ns, &memory_cg, "", 0, 0)
	    == XCGROUP_SUCCESS)
		xcgroup_set_uint32_param(&memory_cg, "tasks", getpid());

	/*
	 * Lock the root of the cgroup and remove the subdirectories
	 * related to this job.
	 */
	lock_ok = true;
	if (xcgroup_lock(&memory_cg) != XCGROUP_SUCCESS) {
		error("%s: failed to flock() %s %m",
		      __func__, memory_cg.path);
		lock_ok = false;
	}

	/*
	 * Clean up starting from the leaves and working upward, the reverse
	 * of the order in which the cgroups were created. The debug2
	 * messages are not errors, since other processes/plugins may still
	 * be accessing some of these directories. The last one to leave
	 * cleans them up, eventually via the release_agent.
	 */
	for (cc = 0; cc <= max_task_id; cc++) {
		xcgroup_t cgroup;
		char *buf = NULL;

		/*
		 * rmdir all tasks this running slurmstepd
		 * was responsible for.
		 */
		xstrfmtcat(buf, "%s%s/task_%d", memory_ns.mnt_point,
			   jobstep_cgroup_path, cc);
		cgroup.path = buf;

		if (xcgroup_delete(&cgroup) != XCGROUP_SUCCESS) {
			debug2("%s: failed to delete %s %m", __func__, buf);
		}
		xfree(buf);
	}

	/*
	 * Clean the rest of the hierarchy.
	 * Although rmdir() offlines the memcg, it may still remain due to
	 * charged file caches. Some out-of-use page caches may stay charged
	 * until memory pressure occurs. Avoid this by writing to
	 * 'force_empty'. Note that when memory.kmem.limit_in_bytes is set,
	 * the charges due to kernel pages will still be seen.
	 */
	xcgroup_set_param(&step_memory_cg, "memory.force_empty", "1");

	if (xcgroup_delete(&step_memory_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m",
		       __func__, step_memory_cg.path);
	}

	if (xcgroup_delete(&job_memory_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m",
		       __func__, job_memory_cg.path);
	}

	if (xcgroup_delete(&user_memory_cg) != XCGROUP_SUCCESS) {
		debug2("%s: failed to delete %s %m",
		       __func__, user_memory_cg.path);
	}

	if (lock_ok)
		xcgroup_unlock(&memory_cg);

	xcgroup_destroy(&memory_cg);
	xcgroup_destroy(&user_memory_cg);
	xcgroup_destroy(&job_memory_cg);
	xcgroup_destroy(&step_memory_cg);

	FREE_NULL_LIST(task_memory_cg_list);

	user_cgroup_path[0] = '\0';
	job_cgroup_path[0] = '\0';
	jobstep_cgroup_path[0] = '\0';
	task_cgroup_path[0] = '\0';

	xcgroup_ns_destroy(&memory_ns);

	return SLURM_SUCCESS;
}

extern int jobacct_gather_cgroup_memory_attach_task(pid_t pid,
						    jobacct_id_t *jobacct_id)
{
	stepd_step_rec_t *job = jobacct_id->job;

	if (jobacct_id->taskid >= max_task_id)
		max_task_id = jobacct_id->taskid;

	debug("%s: %ps taskid %u max_task_id %u",
	      __func__, &job->step_id, jobacct_id->taskid, max_task_id);

	return create_jobacct_cgroups(__func__,
				      jobacct_id,
				      pid,
				      &memory_ns,
				      &job_memory_cg,
				      &step_memory_cg,
				      task_memory_cg_list,
				      &user_memory_cg,
				      job_cgroup_path,
				      jobstep_cgroup_path,
				      task_cgroup_path,
				      user_cgroup_path);
}
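
/*
 * Illustrative sketch only, not part of the original plugin: it shows the
 * expected call order of the entry points above (init, then attach_task per
 * task, then fini at step teardown) for a single task. The helper name
 * _example_track_one_task is hypothetical and the block is compiled out; in
 * practice these functions are invoked by the jobacct_gather/cgroup plugin
 * hooks in slurmstepd, not by a helper like this.
 */
#if 0
static int _example_track_one_task(pid_t pid, jobacct_id_t *jobacct_id)
{
	/* Create the memory cgroup namespace and the task cgroup list. */
	if (jobacct_gather_cgroup_memory_init() != SLURM_SUCCESS)
		return SLURM_ERROR;

	/* Create user/job/step/task memory cgroups and attach the task pid. */
	if (jobacct_gather_cgroup_memory_attach_task(pid, jobacct_id) !=
	    SLURM_SUCCESS)
		return SLURM_ERROR;

	/* Remove the hierarchy again once the step is done. */
	return jobacct_gather_cgroup_memory_fini();
}
#endif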