/*****************************************************************************\
* acct_gather_energy_rapl.c - slurm energy accounting plugin for rapl.
*****************************************************************************
* Copyright (C) 2012
* Written by Bull- Yiannis Georgiou
* CODE-OCEC-09-009. All rights reserved.
*
* This file is part of Slurm, a resource management program.
* For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* This file is patterned after jobcomp_linux.c, written by Morris Jette and
* Copyright (C) 2002 The Regents of the University of California.
\*****************************************************************************/
/* acct_gather_energy_rapl
* This plugin does not initiate a node-level thread.
* It will be used to load energy values from cpu/core
* sensors when hardware/drivers are available
*/
#include
#include
#include "src/common/slurm_xlator.h"
#include "src/common/slurm_acct_gather_energy.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_protocol_defs.h"
#include "src/common/fd.h"
#include "src/slurmd/common/proctrack.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/*
 * Capacity of the per-package arrays in this file (pkg2cpu, pkg_fd and the
 * energy counters). _hardware() aborts if a physical id reaches this value.
 */
#define MAX_PKGS 256
/*
 * MSR addresses passed to _read_msr() as the offset into /dev/cpu/#/msr.
 * Only MSR_RAPL_POWER_UNIT, MSR_PKG_ENERGY_STATUS, MSR_PKG_POWER_INFO and
 * MSR_DRAM_ENERGY_STATUS are read in this file; the others are listed for
 * completeness of each domain.
 */
/* Unit register: power units in bits 3:0, energy status units in bits 12:8 */
#define MSR_RAPL_POWER_UNIT 0x606
/* Package RAPL Domain */
#define MSR_PKG_RAPL_POWER_LIMIT 0x610
#define MSR_PKG_ENERGY_STATUS 0x611
#define MSR_PKG_PERF_STATUS 0x613
#define MSR_PKG_POWER_INFO 0x614
/* DRAM RAPL Domain */
#define MSR_DRAM_POWER_LIMIT 0x618
#define MSR_DRAM_ENERGY_STATUS 0x619
#define MSR_DRAM_PERF_STATUS 0x61B
#define MSR_DRAM_POWER_INFO 0x61C
/*
 * Running energy counters, one slot per package, for the package and DRAM
 * RAPL domains.
 *
 * The energy status registers only provide 32 bits. _get_package_energy()
 * and _get_dram_energy() store the latest raw reading in i.low and bump
 * i.high whenever a reading is smaller than the previous one (counter wrap),
 * then return val as the extended 64-bit count.
 *
 * NOTE(review): overlaying {low, high} on val yields (high << 32) | low only
 * on a little-endian target - confirm that is the only supported layout.
 */
union {
uint64_t val;
struct {
uint32_t low;
uint32_t high;
} i;
} package_energy[MAX_PKGS], dram_energy[MAX_PKGS];
/*
 * Compile-time debug switches. Nothing in the visible part of this file
 * tests either macro.
 */
#define _DEBUG 1
#define _DEBUG_ENERGY 1
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
* <application>/<method>
*
* where <application> is a description of the intended application of
* the plugin (e.g., "jobacct" for Slurm job completion logging) and <method>
* is a description of how this plugin satisfies that application. Slurm will
* only load job completion logging plugins if the plugin_type string has a
* prefix of "jobacct/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
/* Identification symbols read by the generic plugin loader. */
const char plugin_name[] = "AcctGatherEnergy RAPL plugin";
const char plugin_type[] = "acct_gather_energy/rapl";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
/*
 * Node-level energy record. Allocated by acct_gather_energy_p_conf_set(),
 * released by fini(); NULL means the plugin has not been configured yet.
 */
static acct_gather_energy_t *local_energy = NULL;
static int dataset_id = -1; /* id of the dataset for profile data */
/*
 * pkg2cpu[physical id] = number of one cpu in that package, or -1 if the
 * package was never seen. Filled by _hardware() from /proc/cpuinfo.
 */
static int pkg2cpu[MAX_PKGS] = {[0 ... MAX_PKGS-1] = -1};
/*
 * Descriptors returned by _open_msr(), -1 when not open. Slots [0, nb_pkg)
 * are filled by acct_gather_energy_p_conf_set() and closed by fini().
 */
static int pkg_fd[MAX_PKGS] = {[0 ... MAX_PKGS-1] = -1};
/* Local host name, set in init(); used as the node name in drain requests */
static char hostname[MAXHOSTNAMELEN];
/* Number of distinct packages counted by _hardware() */
static int nb_pkg = 0;
/*
 * Step record handed in through ENERGY_DATA_STEP_PTR. It is stored only;
 * nothing in the visible part of this file reads it.
 */
static stepd_step_rec_t *job = NULL;
/*
 * Forward declaration: the data entry points defined earlier in the file
 * call this lazily when local_energy has not been set up yet.
 */
extern void acct_gather_energy_p_conf_set(
int context_id_in, s_p_hashtbl_t *tbl);
/*
 * Translate an MSR address into a short label for log messages.
 *
 * IN which - MSR address
 * RET "PowerUnit" for MSR_RAPL_POWER_UNIT, "PowerInfo" for
 *     MSR_PKG_POWER_INFO, "UnknownType" for anything else
 */
static char *_msr_string(int which)
{
	switch (which) {
	case MSR_RAPL_POWER_UNIT:
		return "PowerUnit";
	case MSR_PKG_POWER_INFO:
		return "PowerInfo";
	default:
		return "UnknownType";
	}
}
/*
 * Read one 64-bit model specific register through an open msr descriptor.
 *
 * IN fd    - descriptor returned by _open_msr()
 * IN which - MSR address, used as the file offset
 * RET the register value, or 0 if the seek or the read failed
 *
 * A failed seek is logged as an error and ends the call: reading at
 * whatever offset the descriptor happens to have would return the content
 * of some other register. A failed read of MSR_DRAM_ENERGY_STATUS is only
 * reported once, and only when the ENERGY debug flag is set; any other
 * failed read is reported at debug level.
 */
static uint64_t _read_msr(int fd, int which)
{
	uint64_t data = 0;
	static bool first = true;

	if (lseek(fd, which, SEEK_SET) < 0) {
		error("lseek of /dev/cpu/#/msr: %m");
		return 0;
	}

	if (read(fd, &data, sizeof(data)) == sizeof(data))
		return data;

	if (which == MSR_DRAM_ENERGY_STATUS) {
		if (first &&
		    (slurm_conf.debug_flags & DEBUG_FLAG_ENERGY)) {
			first = false;
			info("It appears you don't have any DRAM, "
			     "this can be common. Check your system "
			     "if you think this is in error.");
		}
	} else {
		debug("Check if your CPU has RAPL support for %s: %m",
		      _msr_string(which));
	}

	/* Never hand back a partially filled buffer */
	return 0;
}
/*
 * Return the extended 64-bit package energy count for one package slot.
 *
 * MSR_PKG_ENERGY_STATUS
 * Total Energy Consumed - bits 31:0
 * Reserved - bits 63:32
 * See: Intel 64 and IA-32 Architectures Software Developer's
 * Manual, Volume 3 for details
 *
 * IN pkg - index into pkg_fd[] and package_energy[]
 * RET package_energy[pkg].val after folding in the new reading
 */
static uint64_t _get_package_energy(int pkg)
{
	uint32_t raw = (uint32_t)(_read_msr(pkg_fd[pkg],
					    MSR_PKG_ENERGY_STATUS) &
				  0xffffffff);

	/* A reading below the previous one means the counter wrapped */
	if (raw < package_energy[pkg].i.low)
		package_energy[pkg].i.high++;
	package_energy[pkg].i.low = raw;

	return package_energy[pkg].val;
}
/*
 * Return the extended 64-bit DRAM energy count for one package slot.
 *
 * MSR_DRAM_ENERGY_STATUS
 * Total Energy Consumed - bits 31:0
 * Reserved - bits 63:32
 * See: Intel 64 and IA-32 Architectures Software Developer's
 * Manual, Volume 3 for details
 *
 * IN pkg - index into pkg_fd[] and dram_energy[]
 * RET dram_energy[pkg].val after folding in the new reading
 */
static uint64_t _get_dram_energy(int pkg)
{
	uint32_t raw = (uint32_t)(_read_msr(pkg_fd[pkg],
					    MSR_DRAM_ENERGY_STATUS) &
				  0xffffffff);

	/* A reading below the previous one means the counter wrapped */
	if (raw < dram_energy[pkg].i.low)
		dram_energy[pkg].i.high++;
	dram_energy[pkg].i.low = raw;

	return dram_energy[pkg].val;
}
/*
 * Open the msr device of one cpu read-only.
 *
 * IN core - cpu number; a negative value (the "no cpu" marker used in
 *           pkg2cpu[]) is rejected without touching the filesystem
 * RET an open descriptor with close-on-exec set, or a negative value on
 *     failure (the reason is logged)
 */
static int _open_msr(int core)
{
	char msr_filename[BUFSIZ];
	int fd;

	if (core < 0) {
		error("%s: invalid CPU id %d", __func__, core);
		return -1;
	}

	snprintf(msr_filename, sizeof(msr_filename), "/dev/cpu/%d/msr", core);
	fd = open(msr_filename, O_RDONLY);

	if (fd < 0) {
		if (errno == ENXIO) {
			error("No CPU %d", core);
		} else if (errno == EIO) {
			error("CPU %d doesn't support MSRs", core);
		} else
			error("MSR register problem (%s): %m", msr_filename);
	} else {
		/*
		 * If this is loaded in the slurmd we need to make sure it
		 * gets closed when a slurmstepd launches.
		 */
		fd_set_close_on_exec(fd);
	}

	return fd;
}
/*
 * Discover the cpu packages of this node from /proc/cpuinfo.
 *
 * For every "physical id" line, the cpu number of the most recent
 * "processor" line is recorded in pkg2cpu[physical id] if that package has
 * not been seen before, and nb_pkg is incremented. Calls fatal() if
 * /proc/cpuinfo cannot be opened or a physical id does not fit in MAX_PKGS.
 *
 * A line whose number cannot be parsed resets the matching id to -1, so it
 * is reported instead of silently reusing the value of an earlier entry.
 */
static void _hardware(void)
{
	char buf[1024];
	FILE *fd;
	int cpu = -1, pkg = -1;

	if (!(fd = fopen("/proc/cpuinfo", "r")))
		fatal("RAPL: error on attempt to open /proc/cpuinfo");

	while (fgets(buf, sizeof(buf), fd)) {
		if (!xstrncmp(buf, "processor", sizeof("processor") - 1)) {
			if (sscanf(buf, "processor\t: %d", &cpu) != 1)
				cpu = -1;
			continue;
		}

		if (xstrncmp(buf, "physical id", sizeof("physical id") - 1))
			continue;

		if (sscanf(buf, "physical id\t: %d", &pkg) != 1)
			pkg = -1;

		if (cpu < 0) {
			error("%s: No processor ID found", plugin_name);
		} else if (pkg < 0) {
			error("%s: No physical ID found", plugin_name);
		} else if (pkg >= MAX_PKGS) {
			fatal("%s: Configured for up to %d sockets and you have %d. "
			      "Update src/plugins/acct_gather_energy/"
			      "rapl/acct_gather_energy_rapl.h "
			      "(MAX_PKGS) and recompile.",
			      plugin_name, MAX_PKGS, pkg);
		} else if (pkg2cpu[pkg] == -1) {
			/* First cpu seen for this package */
			nb_pkg++;
			pkg2cpu[pkg] = cpu;
		}
	}
	fclose(fd);

	log_flag(ENERGY, "RAPL Found: %d packages", nb_pkg);
}
/*
 * _send_drain_request()
 *
 * Ask the controller to drain this node because energy data cannot be
 * collected. Once a request has been accepted, later calls do nothing; a
 * failed request clears the flag so the next call tries again.
 */
static void _send_drain_request(void)
{
	static char already_sent;
	update_node_msg_t drain_msg;

	if (!already_sent) {
		slurm_init_update_node_msg(&drain_msg);
		drain_msg.node_names = hostname;
		drain_msg.reason = "Cannot collect energy data.";
		drain_msg.node_state = NODE_STATE_DRAIN;

		/* Marked as sent before the call, undone below on failure */
		already_sent = 1;
		debug("%s: sending NODE_STATE_DRAIN to controller", __func__);

		if (slurm_update_node(&drain_msg) == SLURM_SUCCESS)
			return;

		error("%s: Unable to drain node %s: %m", __func__, hostname);
		already_sent = 0;
	}
}
/*
 * Sample the RAPL counters of every package and update an energy record.
 *
 * IN/OUT energy - record to update. On the first call for a record
 *                 (consumed_energy == 0) only the baseline is stored and
 *                 consumed_energy is set to 1 as a "baseline taken" marker;
 *                 later calls fill consumed_energy, current_watts and
 *                 ave_watts. previous_consumed_energy and poll_time are
 *                 refreshed on every call.
 *
 * If the first msr descriptor is not open, nothing is sampled and a drain
 * request is sent for the node instead.
 */
static void _get_joules_task(acct_gather_energy_t *energy)
{
	int i;
	double energy_units;
	uint64_t result;
	uint64_t joules;
	double ret;
	/*
	 * NOTE(review): this counter is shared by every record passed in,
	 * not kept per record - confirm that is intended.
	 */
	static uint32_t readings = 0;

	if (pkg_fd[0] < 0) {
		error("%s: device /dev/cpu/#/msr not opened "
		      "energy data cannot be collected.", __func__);
		_send_drain_request();
		return;
	}

	/*
	 * MSR_RAPL_POWER_UNIT
	 * Power Units - bits 3:0
	 * Energy Status Units - bits 12:8
	 * Time Units - bits 19:16
	 * See: Intel 64 and IA-32 Architectures Software Developer's
	 * Manual, Volume 3 for details
	 */
	result = _read_msr(pkg_fd[0], MSR_RAPL_POWER_UNIT);
	energy_units = pow(0.5, (double)((result >> 8) & 0x1f));

	if (slurm_conf.debug_flags & DEBUG_FLAG_ENERGY) {
		double power_units = pow(0.5, (double)(result & 0xf));
		unsigned long max_power;

		info("RAPL powercapture_debug Energy units = %.6f, "
		     "Power Units = %.6f", energy_units, power_units);

		/*
		 * MSR_PKG_POWER_INFO
		 * Thermal Spec Power - bits 14:0
		 * Minimum Power - bits 30:16
		 * Maximum Power - bits 46:32
		 * Maximum Time Window - bits 53:48
		 * See: Intel 64 and IA-32 Architectures Software Developer's
		 * Manual, Volume 3 for details
		 */
		result = _read_msr(pkg_fd[0], MSR_PKG_POWER_INFO);
		max_power = power_units * ((result >> 32) & 0x7fff);
		/* max_power is unsigned, so print it with %lu */
		info("RAPL Max power = %lu w", max_power);
	}

	/* Sum the package and DRAM counters of all packages */
	result = 0;
	for (i = 0; i < nb_pkg; i++)
		result += _get_package_energy(i) + _get_dram_energy(i);

	ret = (double)result * energy_units;
	joules = (uint64_t)ret;

	log_flag(ENERGY, "RAPL Result %"PRIu64" = %.6f Joules", result, ret);

	if (energy->consumed_energy) {
		time_t interval;

		energy->consumed_energy = joules - energy->base_consumed_energy;
		/*
		 * Take the difference in 64 bits and narrow afterwards.
		 * Converting the double total directly to uint32_t is
		 * undefined once the total exceeds UINT32_MAX.
		 */
		energy->current_watts =
			(uint32_t)(joules - energy->previous_consumed_energy);

		/*
		 * NOTE(review): the running average is fed the joule delta
		 * before it is divided by the interval below - confirm.
		 */
		energy->ave_watts = ((energy->ave_watts * readings) +
				     energy->current_watts) / (readings + 1);

		interval = time(NULL) - energy->poll_time;
		if (interval) /* Prevent divide by zero */
			energy->current_watts /= (float)interval;
	} else {
		energy->consumed_energy = 1;
		energy->base_consumed_energy = joules;
		energy->ave_watts = 0;
	}
	readings++;
	energy->previous_consumed_energy = joules;
	energy->poll_time = time(NULL);

	log_flag(ENERGY, "%s: current %.6f Joules, consumed %"PRIu64"",
		 __func__, ret, energy->consumed_energy);
}
/*
 * Tell whether energy profiling is switched on.
 *
 * The running profile options are fetched from the profile plugin until
 * they come back as something other than ACCT_GATHER_PROFILE_NOT_SET; from
 * then on the cached answer is returned.
 *
 * RET non-zero if ACCT_GATHER_PROFILE_ENERGY is set, 0 otherwise
 */
static int _running_profile(void)
{
	static bool energy_profiled = false;
	static uint32_t cached_opt = ACCT_GATHER_PROFILE_NOT_SET;

	if (cached_opt != ACCT_GATHER_PROFILE_NOT_SET)
		return energy_profiled;

	acct_gather_profile_g_get(ACCT_GATHER_PROFILE_RUNNING, &cached_opt);
	if (cached_opt & ACCT_GATHER_PROFILE_ENERGY)
		energy_profiled = true;

	return energy_profiled;
}
static int _send_profile(void)
{
uint64_t curr_watts;
acct_gather_profile_dataset_t dataset[] = {
{ "Power", PROFILE_FIELD_UINT64 },
{ NULL, PROFILE_FIELD_NOT_SET }
};
if (!_running_profile())
return SLURM_SUCCESS;
log_flag(ENERGY, "%s: consumed %u watts",
__func__, local_energy->current_watts);
if (dataset_id < 0) {
dataset_id = acct_gather_profile_g_create_dataset(
"Energy", NO_PARENT, dataset);
log_flag(ENERGY, "Energy: dataset created (id = %d)",
dataset_id);
if (dataset_id == SLURM_ERROR) {
error("Energy: Failed to create the dataset for RAPL");
return SLURM_ERROR;
}
}
curr_watts = (uint64_t)local_energy->current_watts;
log_flag(PROFILE, "PROFILE-Energy: power=%u",
local_energy->current_watts);
return acct_gather_profile_g_add_sample_data(dataset_id,
(void *)&curr_watts,
local_energy->poll_time);
}
/*
 * Refresh the node-level energy record from the RAPL counters.
 *
 * The plugin is configured on demand if that has not happened yet. Nothing
 * is sampled when current_watts is NO_VAL, which conf_set uses to mark that
 * the RAPL unit register could not be read.
 *
 * RET SLURM_SUCCESS always
 */
extern int acct_gather_energy_p_update_node_energy(void)
{
	xassert(running_in_slurmd_stepd());

	if (!local_energy) {
		debug("%s: trying to update node energy, but no local_energy yet.",
		      __func__);
		acct_gather_energy_p_conf_set(0, NULL);
	}

	if (local_energy->current_watts != NO_VAL)
		_get_joules_task(local_energy);

	return SLURM_SUCCESS;
}
/*
* init() is called when the plugin is loaded, before any other functions
* are called. Put global initialization here.
*/
extern int init(void)
{
gethostname(hostname, MAXHOSTNAMELEN);
/* put anything that requires the .conf being read in
acct_gather_energy_p_conf_parse
*/
return SLURM_SUCCESS;
}
/*
 * fini() releases what the plugin acquired: the msr descriptors in
 * pkg_fd[0 .. nb_pkg) and the node-level energy record. It does nothing
 * when not running in slurmd or slurmstepd.
 *
 * RET SLURM_SUCCESS always
 */
extern int fini(void)
{
	int slot;

	if (running_in_slurmd_stepd()) {
		for (slot = 0; slot < nb_pkg; slot++) {
			if (pkg_fd[slot] == -1)
				continue;
			close(pkg_fd[slot]);
			pkg_fd[slot] = -1;
		}

		acct_gather_energy_destroy(local_energy);
		local_energy = NULL;
	}

	return SLURM_SUCCESS;
}
extern int acct_gather_energy_p_get_data(enum acct_energy_type data_type,
void *data)
{
int rc = SLURM_SUCCESS;
acct_gather_energy_t *energy = (acct_gather_energy_t *)data;
time_t *last_poll = (time_t *)data;
uint16_t *sensor_cnt = (uint16_t *)data;
xassert(running_in_slurmd_stepd());
if (!local_energy) {
debug("%s: trying to get data %d, but no local_energy yet.",
__func__, data_type);
acct_gather_energy_p_conf_set(0, NULL);
}
switch (data_type) {
case ENERGY_DATA_JOULES_TASK:
case ENERGY_DATA_NODE_ENERGY_UP:
if (local_energy->current_watts == NO_VAL)
energy->consumed_energy = NO_VAL64;
else
_get_joules_task(energy);
break;
case ENERGY_DATA_STRUCT:
case ENERGY_DATA_NODE_ENERGY:
memcpy(energy, local_energy, sizeof(acct_gather_energy_t));
break;
case ENERGY_DATA_LAST_POLL:
*last_poll = local_energy->poll_time;
break;
case ENERGY_DATA_SENSOR_CNT:
*sensor_cnt = 1;
break;
default:
error("acct_gather_energy_p_get_data: unknown enum %d",
data_type);
rc = SLURM_ERROR;
break;
}
return rc;
}
extern int acct_gather_energy_p_set_data(enum acct_energy_type data_type,
void *data)
{
int rc = SLURM_SUCCESS;
xassert(running_in_slurmd_stepd());
switch (data_type) {
case ENERGY_DATA_RECONFIG:
break;
case ENERGY_DATA_PROFILE:
_get_joules_task(local_energy);
_send_profile();
break;
case ENERGY_DATA_STEP_PTR:
/* set global job if needed later */
job = (stepd_step_rec_t *)data;
break;
default:
error("acct_gather_energy_p_set_data: unknown enum %d",
data_type);
rc = SLURM_ERROR;
break;
}
return rc;
}
/*
 * This plugin defines no configuration options of its own; both arguments
 * are left untouched.
 */
extern void acct_gather_energy_p_conf_options(s_p_options_t **full_options,
					      int *full_options_cnt)
{
	/* nothing to add */
}
/*
 * One-time setup: discover the packages, open one msr device per package
 * and allocate the node-level energy record.
 *
 * IN context_id_in - not used by this plugin
 * IN tbl           - not used by this plugin
 *
 * Does nothing outside slurmd/slurmstepd or when already set up. If the
 * RAPL unit register reads as 0, current_watts is set to NO_VAL so the data
 * entry points skip sampling.
 */
extern void acct_gather_energy_p_conf_set(int context_id_in,
					  s_p_hashtbl_t *tbl)
{
	int pkg, opened = 0;
	uint64_t result;

	if (!running_in_slurmd_stepd())
		return;

	/* Already been here, we shouldn't need to visit again */
	if (local_energy)
		return;

	_hardware();

	/*
	 * pkg2cpu[] is indexed by physical id, and physical ids need not be
	 * contiguous, while pkg_fd[] is consumed as a dense array over
	 * [0, nb_pkg). Walk every slot and pack the descriptors, rather than
	 * assuming the ids are 0 .. nb_pkg - 1.
	 */
	for (pkg = 0; (pkg < MAX_PKGS) && (opened < nb_pkg); pkg++) {
		if (pkg2cpu[pkg] < 0)
			continue;
		pkg_fd[opened++] = _open_msr(pkg2cpu[pkg]);
	}

	local_energy = acct_gather_energy_alloc(1);

	result = _read_msr(pkg_fd[0], MSR_RAPL_POWER_UNIT);
	if (result == 0)
		local_energy->current_watts = NO_VAL;

	debug("%s loaded", plugin_name);
}
/*
 * This plugin has no configuration values to report; the list is left
 * untouched.
 */
extern void acct_gather_energy_p_conf_values(List *data)
{
	/* nothing to report */
}