/*****************************************************************************\
* acct_gather_energy_rsmi.c - slurm energy accounting plugin for AMD GPU.
*****************************************************************************
* Copyright (C) 2019 SchedMD LLC
* Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved.
* Written by Advanced Micro Devices,
* who borrowed from the ipmi plugin of the same type
*
* This file is part of Slurm, a resource management program.
 * For details, see <https://slurm.schedmd.com/>.
* Please also read the included file: DISCLAIMER.
*
* Slurm is free software; you can redistribute it and/or modify it under
* the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* In addition, as a special exception, the copyright holders give permission
* to link the code of portions of this program with the OpenSSL library under
* certain conditions as described in each individual source file, and
* distribute linked combinations including the two. You must obey the GNU
* General Public License in all respects for all of the code used other than
* OpenSSL. If you modify file(s) with this exception, you may extend this
* exception to your version of the file(s), but you are not obligated to do
* so. If you do not wish to do so, delete this exception statement from your
* version. If you delete this exception statement from all source files in
* the program, then also delete it here.
*
* Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along
* with Slurm; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
\*****************************************************************************/
/* acct_gather_energy_rsmi
 * This plugin starts a node-level thread that runs in the Slurmd daemon and
 * periodically reads the current average power of the AMD GPUs through the
 * RSMI interface, accumulating the energy consumption of all AMD GPUs on the
 * node. It also runs in the Slurmstepd daemon and collects the energy
 * consumption for a job.
*/
#include <dlfcn.h>
#include <rocm_smi/rocm_smi.h>
#include "src/common/slurm_xlator.h"
#include "src/common/slurm_acct_gather_energy.h"
#include "src/common/slurm_acct_gather_profile.h"
#include "src/common/gres.h"
#define DEFAULT_RSMI_TIMEOUT 10
#define DEFAULT_RSMI_FREQ 30
/*
* These variables are required by the generic plugin interface. If they
* are not found in the plugin, the plugin loader will ignore it.
*
* plugin_name - a string giving a human-readable description of the
* plugin. There is no maximum length, but the symbol must refer to
* a valid string.
*
* plugin_type - a string suggesting the type of the plugin or its
* applicability to a particular form of data or method of data handling.
* If the low-level plugin API is used, the contents of this string are
* unimportant and may be anything. Slurm uses the higher-level plugin
* interface which requires this string to be of the form
*
* /
*
* where is a description of the intended application of
* the plugin (e.g., "jobacct" for Slurm job completion logging) and
* is a description of how this plugin satisfies that application. Slurm will
* only load job completion logging plugins if the plugin_type string has a
* prefix of "jobacct/".
*
* plugin_version - an unsigned 32-bit integer containing the Slurm version
* (major.minor.micro combined into a single number).
*/
const char plugin_name[] = "AcctGatherEnergy rsmi plugin";
const char plugin_type[] = "acct_gather_energy/rsmi";
const uint32_t plugin_version = SLURM_VERSION_NUMBER;
// Per-GPU power/energy status; an array of these (one per GPU) is kept
typedef struct {
uint32_t last_update_watt;
time_t last_update_time;
time_t previous_update_time;
acct_gather_energy_t energy;
} gpu_status_t;
/*
* internal variables
*/
static int context_id = -1;
// Copy of the GPUs usable by the step; only used by slurmstepd for a job
static bitstr_t *saved_usable_gpus = NULL;
static gpu_status_t *gpus = NULL;
static uint16_t gpus_len = 0;
static uint64_t *start_current_energies = NULL;
static int dataset_id = -1; // id of the dataset for profile data
static uint64_t debug_flags = 0;
static bool flag_energy_accounting_shutdown = false;
static bool flag_thread_started = false;
static pthread_mutex_t rsmi_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rsmi_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t launch_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t launch_cond = PTHREAD_COND_INITIALIZER;
static stepd_step_rec_t *job = NULL;
pthread_t thread_rsmi_id_launcher = 0;
pthread_t thread_rsmi_id_run = 0;
/*
* Check running profile
*/
static int _running_profile(void)
{
static bool run = false;
static uint32_t profile_opt = ACCT_GATHER_PROFILE_NOT_SET;
if (profile_opt == ACCT_GATHER_PROFILE_NOT_SET) {
acct_gather_profile_g_get(ACCT_GATHER_PROFILE_RUNNING,
&profile_opt);
if (profile_opt & ACCT_GATHER_PROFILE_ENERGY)
run = true;
}
return run;
}
/*
* Send profile
*/
static int _send_profile(void)
{
uint16_t i;
uint64_t data[gpus_len];
time_t last_time = gpus[gpus_len - 1].last_update_time;
if (!_running_profile())
return SLURM_SUCCESS;
if (dataset_id < 0) {
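		/* First call: create the profile dataset with one power
		 * column per GPU */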
acct_gather_profile_dataset_t dataset[gpus_len + 1];
for (i = 0; i < gpus_len; i++) {
dataset[i].name = xstrdup_printf("GPU%dPower", i);
dataset[i].type = PROFILE_FIELD_UINT64;
}
dataset[i].name = NULL;
dataset[i].type = PROFILE_FIELD_NOT_SET;
dataset_id = acct_gather_profile_g_create_dataset(
"Energy", NO_PARENT, dataset);
for (i = 0; i < gpus_len; i++)
xfree(dataset[i].name);
if (debug_flags & DEBUG_FLAG_ENERGY)
debug("Energy: dataset created (id = %d)", dataset_id);
if (dataset_id == SLURM_ERROR) {
error("Energy: Failed to create the dataset");
return SLURM_ERROR;
}
}
/* pack an array of uint64_t with current power of gpus */
memset(data, 0, sizeof(data));
for (i = 0; i < gpus_len; i++) {
data[i] = gpus[i].energy.current_watts;
last_time = gpus[i].energy.poll_time;
}
if (debug_flags & DEBUG_FLAG_PROFILE) {
for (i = 0; i < gpus_len; i++) {
info("PROFILE-Energy: GPU%dPower=%"PRIu64"",
i, data[i]);
}
}
return acct_gather_profile_g_add_sample_data(dataset_id, (void *)data,
last_time);
}
/*
 * _read_rsmi_value reads the current average power and updates
 * last_update_watt
 *
 * dv_ind (IN) The device index
 * gpu (IN/OUT) A pointer to the gpu_status_t structure to update
*/
static int _read_rsmi_value(uint32_t dv_ind, gpu_status_t *gpu)
{
const char *status_string;
	uint64_t curr_micro_watts;
	rsmi_status_t rsmi_rc = rsmi_dev_power_ave_get(
		dv_ind, 0, &curr_micro_watts);
if (rsmi_rc != RSMI_STATUS_SUCCESS) {
rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
error("RSMI: Failed to get power: %s", status_string);
gpu->energy.current_watts = NO_VAL;
return SLURM_ERROR;
}
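	/* rsmi_dev_power_ave_get() reports average power in microwatts;
	 * convert to watts */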
	gpu->last_update_watt = curr_micro_watts / 1000000;
gpu->previous_update_time = gpu->last_update_time;
gpu->last_update_time = time(NULL);
return SLURM_SUCCESS;
}
/*
* _get_additional_consumption computes consumption between 2 times
* time0 (IN) Previous time
* time1 (IN) Current time
* watt0 (IN) Previous watts
* watt1 (IN) Current watts
*/
static uint64_t _get_additional_consumption(time_t time0, time_t time1,
uint32_t watt0, uint32_t watt1)
{
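	/*
	 * Trapezoidal approximation: joules = elapsed seconds times the
	 * average of the two power readings (in watts).
	 */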
return (uint64_t) ((time1 - time0)*(watt1 + watt0)/2);
}
/* Updates the given GPU's energy according to its last power reading
 * gpu (IN/OUT) A pointer to the gpu_status_t structure to update
 * readings (IN) Number of prior readings, used to weight the average watts
*/
static void _update_energy(gpu_status_t *gpu, uint32_t readings)
{
uint32_t prev_watts;
acct_gather_energy_t *e = &gpu->energy;
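	/*
	 * After the first valid reading, fold the previous sample into the
	 * running average and accumulate the energy consumed since the last
	 * update; otherwise just record the current power.
	 */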
if (e->current_watts && (e->current_watts != NO_VAL)) {
prev_watts = e->current_watts;
e->ave_watts = ((e->ave_watts * readings) +
e->current_watts) / (readings + 1);
e->current_watts = gpu->last_update_watt;
if (gpu->previous_update_time == 0)
e->base_consumed_energy = 0;
else
e->base_consumed_energy =
_get_additional_consumption(
gpu->previous_update_time,
gpu->last_update_time,
prev_watts,
e->current_watts);
e->previous_consumed_energy = e->consumed_energy;
e->consumed_energy += e->base_consumed_energy;
} else {
e->consumed_energy = 0;
e->ave_watts = 0;
e->current_watts = gpu->last_update_watt;
}
e->poll_time = time(NULL);
}
/*
 * _thread_update_node_energy calls _read_rsmi_value for each GPU and updates
 * all values for node consumption
*/
static int _thread_update_node_energy(void)
{
int rc = SLURM_SUCCESS;
uint16_t i;
static uint32_t readings = 0;
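	/* readings counts completed polling passes and weights the running
	 * average computed in _update_energy() */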
for (i = 0; i < gpus_len; i++) {
rc = _read_rsmi_value(i, &gpus[i]);
if (rc == SLURM_SUCCESS) {
_update_energy(&gpus[i], readings);
}
}
readings++;
if (debug_flags & DEBUG_FLAG_ENERGY) {
for (i = 0; i < gpus_len; i++)
info("rsmi-thread: gpu %u current_watts: %u, consumed %"PRIu64" Joules %"PRIu64" new, ave watts %u",
i,
gpus[i].energy.current_watts,
gpus[i].energy.consumed_energy,
gpus[i].energy.base_consumed_energy,
gpus[i].energy.ave_watts);
}
return rc;
}
/* Get the total # of GPUs in the system
*
* device_count (OUT) Number of available GPU devices
*/
static void _rsmi_get_device_count(unsigned int *device_count)
{
const char *status_string;
rsmi_status_t rsmi_rc = rsmi_num_monitor_devices(device_count);
if (rsmi_rc != RSMI_STATUS_SUCCESS) {
rsmi_rc = rsmi_status_string(rsmi_rc, &status_string);
error("RSMI: Failed to get device count: %s", status_string);
*device_count = 0;
}
}
/*
* _thread_init initializes values and conf for the rsmi thread
*/
static int _thread_init(void)
{
if (gpus_len && gpus) {
if (debug_flags & DEBUG_FLAG_ENERGY)
info("%s thread init", plugin_name);
return SLURM_SUCCESS;
} else {
error("%s thread init failed, no GPU available", plugin_name);
return SLURM_ERROR;
}
}
/*
 * _thread_rsmi_run is the thread that periodically calls RSMI
 * and reads the energy values from the AMD GPUs
*/
static void *_thread_rsmi_run(void *no_data)
{
struct timeval tvnow;
struct timespec abs;
flag_energy_accounting_shutdown = false;
if (debug_flags & DEBUG_FLAG_ENERGY)
info("rsmi-thread: launched");
(void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
(void) pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
slurm_mutex_lock(&rsmi_mutex);
if (_thread_init() != SLURM_SUCCESS) {
if (debug_flags & DEBUG_FLAG_ENERGY)
info("rsmi-thread: aborted");
slurm_mutex_unlock(&rsmi_mutex);
slurm_mutex_lock(&launch_mutex);
slurm_cond_signal(&launch_cond);
slurm_mutex_unlock(&launch_mutex);
return NULL;
}
(void) pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, NULL);
slurm_mutex_unlock(&rsmi_mutex);
flag_thread_started = true;
slurm_mutex_lock(&launch_mutex);
slurm_cond_signal(&launch_cond);
slurm_mutex_unlock(&launch_mutex);
/* setup timer */
gettimeofday(&tvnow, NULL);
abs.tv_sec = tvnow.tv_sec;
abs.tv_nsec = tvnow.tv_usec * 1000;
	// Loop until Slurm shuts the plugin down
while (!flag_energy_accounting_shutdown) {
slurm_mutex_lock(&rsmi_mutex);
_thread_update_node_energy();
/* Sleep until the next time. */
abs.tv_sec += DEFAULT_RSMI_FREQ;
slurm_cond_timedwait(&rsmi_cond, &rsmi_mutex, &abs);
slurm_mutex_unlock(&rsmi_mutex);
}
if (debug_flags & DEBUG_FLAG_ENERGY)
info("rsmi-thread: ended");
return NULL;
}
/*
* _thread_launcher is the thread that launches rsmi thread
*/
static void *_thread_launcher(void *no_data)
{
struct timeval tvnow;
struct timespec abs;
slurm_thread_create(&thread_rsmi_id_run, _thread_rsmi_run, NULL);
/* setup timer */
gettimeofday(&tvnow, NULL);
abs.tv_sec = tvnow.tv_sec + DEFAULT_RSMI_TIMEOUT;
abs.tv_nsec = tvnow.tv_usec * 1000;
slurm_mutex_lock(&launch_mutex);
slurm_cond_timedwait(&launch_cond, &launch_mutex, &abs);
slurm_mutex_unlock(&launch_mutex);
if (!flag_thread_started) {
error("%s threads failed to start in a timely manner",
plugin_name);
flag_energy_accounting_shutdown = true;
/*
		 * RSMI calls are known to hang occasionally, so cancel the
		 * thread if we must.
*/
pthread_cancel(thread_rsmi_id_run);
/*
		 * Unlock just to be safe, since the thread may have been
		 * canceled while holding the lock.
*/
slurm_mutex_unlock(&rsmi_mutex);
}
return NULL;
}
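/*
 * _add_energy accumulates one GPU's readings into the node/job total.
 * GPUs whose current_watts is NO_VAL (failed read) are skipped.
 */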
static void _add_energy(acct_gather_energy_t *energy_tot,
acct_gather_energy_t *energy_new,
int gpu_num)
{
if (energy_new->current_watts == NO_VAL)
return;
energy_tot->base_consumed_energy += energy_new->base_consumed_energy;
energy_tot->ave_watts += energy_new->ave_watts;
energy_tot->consumed_energy += energy_new->consumed_energy;
energy_tot->current_watts += energy_new->current_watts;
energy_tot->previous_consumed_energy +=
energy_new->previous_consumed_energy;
/*
* node poll_time is computed as the oldest poll_time of
* the gpus
*/
if (!energy_tot->poll_time ||
(energy_tot->poll_time > energy_new->poll_time))
energy_tot->poll_time = energy_new->poll_time;
if (debug_flags & DEBUG_FLAG_ENERGY)
info("%s: gpu: %d, current_watts: %u, consumed %"PRIu64" Joules %"PRIu64" new, ave watts %u",
__func__,
gpu_num,
energy_new->current_watts,
energy_new->consumed_energy,
energy_new->base_consumed_energy,
energy_new->ave_watts);
}
/* Get the energy for a job (sums only the GPUs usable by the job)
 * energy (OUT) a pointer to the acct_gather_energy_t structure to fill
*/
static void _get_node_energy_up(acct_gather_energy_t *energy)
{
slurm_cgroup_conf_t *cg_conf;
bool task_cgroup = false;
bool constrained_devices = false;
bool cgroups_active = false;
char *task_plugin_type = NULL;
uint16_t i;
// Check if GPUs are constrained by cgroups
slurm_mutex_lock(&xcgroup_config_read_mutex);
cg_conf = xcgroup_get_slurm_cgroup_conf();
if (cg_conf && cg_conf->constrain_devices)
constrained_devices = true;
slurm_mutex_unlock(&xcgroup_config_read_mutex);
// Check if task/cgroup plugin is loaded
task_plugin_type = slurm_get_task_plugin();
if (xstrstr(task_plugin_type, "cgroup"))
task_cgroup = true;
xfree(task_plugin_type);
// If both of these are true, then GPUs will be constrained
if (constrained_devices && task_cgroup) {
cgroups_active = true;
if (debug_flags & DEBUG_FLAG_ENERGY)
debug2("%s: cgroups are configured.", __func__);
} else {
if (debug_flags & DEBUG_FLAG_ENERGY)
debug2("%s: cgroups are NOT configured.", __func__);
}
// sum the energy of all gpus for this job
memset(energy, 0, sizeof(acct_gather_energy_t));
for (i = 0; i < gpus_len; i++) {
		// If cgroups are active, skip GPUs not usable by this step
if (cgroups_active && !bit_test(saved_usable_gpus, i)) {
if (debug_flags & DEBUG_FLAG_ENERGY)
debug2("Passing over gpu %u", i);
continue;
}
_add_energy(energy, &gpus[i].energy, i);
}
if (debug_flags & DEBUG_FLAG_ENERGY)
info("%s: current_watts: %u, consumed %"PRIu64" Joules %"PRIu64" new, ave watts %u",
__func__,
energy->current_watts,
energy->consumed_energy,
energy->base_consumed_energy,
energy->ave_watts);
}
/* Get the energy for a node
 * energy (OUT) a pointer to the acct_gather_energy_t structure to fill
*/
static void _get_node_energy(acct_gather_energy_t *energy)
{
uint16_t i;
// sum the energy of all gpus for this node
memset(energy, 0, sizeof(acct_gather_energy_t));
for (i = 0; i < gpus_len; i++)
_add_energy(energy, &gpus[i].energy, i);
if (debug_flags & DEBUG_FLAG_ENERGY)
info("%s: current_watts: %u, consumed %"PRIu64" Joules %"PRIu64" new, ave watts %u",
__func__,
energy->current_watts,
energy->consumed_energy,
energy->base_consumed_energy,
energy->ave_watts);
}
/* Get the energy in joules for a job
 * delta (IN) Use cached data if it is newer than this many seconds
*/
static int _get_joules_task(uint16_t delta)
{
time_t now = time(NULL);
static bool stepd_first = true;
uint64_t adjustment = 0;
uint16_t i;
acct_gather_energy_t *new, *old;
/* gpus list */
acct_gather_energy_t *energies = NULL;
uint16_t gpu_cnt = 0;
xassert(context_id != -1);
if (slurm_get_node_energy(
NULL, context_id, delta, &gpu_cnt, &energies)) {
error("%s: can't get info from slurmd", __func__);
return SLURM_ERROR;
}
if (stepd_first) {
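		/* First call in this step: allocate per-GPU state and the
		 * baseline energy counters */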
gpus_len = gpu_cnt;
gpus = xcalloc(sizeof(gpu_status_t), gpus_len);
start_current_energies = xcalloc(sizeof(uint64_t), gpus_len);
}
if (gpu_cnt != gpus_len) {
error("%s: received %u sensors, %u expected",
__func__, gpu_cnt, gpus_len);
acct_gather_energy_destroy(energies);
return SLURM_ERROR;
}
for (i = 0; i < gpu_cnt; i++) {
new = &energies[i];
old = &gpus[i].energy;
new->previous_consumed_energy = old->consumed_energy;
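		/*
		 * Extrapolate consumption from slurmd's last poll to now,
		 * assuming power stayed at current_watts over that interval.
		 */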
adjustment = _get_additional_consumption(
new->poll_time, now,
new->current_watts,
new->current_watts);
if (!stepd_first) {
new->consumed_energy -= start_current_energies[i];
new->base_consumed_energy = adjustment +
(new->consumed_energy - old->consumed_energy);
} else {
/*
			 * This is just for the step, so take all the previous
			 * consumption out of the mix.
*/
start_current_energies[i] =
new->consumed_energy + adjustment;
new->base_consumed_energy = 0;
}
new->consumed_energy = new->previous_consumed_energy
+ new->base_consumed_energy;
memcpy(old, new, sizeof(acct_gather_energy_t));
if (debug_flags & DEBUG_FLAG_ENERGY)
info("%s: consumed %"PRIu64" Joules (received %"PRIu64"(%u watts) from slurmd)",
__func__,
new->consumed_energy,
new->base_consumed_energy,
new->current_watts);
}
acct_gather_energy_destroy(energies);
stepd_first = false;
return SLURM_SUCCESS;
}
/*
* init() is called when the plugin is loaded, before any other functions
* are called. Put global initialization here.
*/
extern int init(void)
{
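	/*
	 * Probe for the ROCm SMI shared library up front so a misconfigured
	 * node fails fast rather than at the first RSMI call.
	 */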
if (!dlopen("librocm_smi64.so", RTLD_NOW | RTLD_GLOBAL))
fatal("RSMI configured, but wasn't found.");
debug_flags = slurm_get_debug_flags();
/* put anything that requires the .conf being read in
	   acct_gather_energy_p_conf_set
*/
rsmi_init(0);
return SLURM_SUCCESS;
}
/*
* fini() is called when the plugin exits.
*/
extern int fini(void)
{
if (!running_in_slurmdstepd())
return SLURM_SUCCESS;
flag_energy_accounting_shutdown = true;
slurm_mutex_lock(&launch_mutex);
/* clean up the launch thread */
slurm_cond_signal(&launch_cond);
slurm_mutex_unlock(&launch_mutex);
if (thread_rsmi_id_launcher)
pthread_join(thread_rsmi_id_launcher, NULL);
slurm_mutex_lock(&rsmi_mutex);
/* clean up the run thread */
slurm_cond_signal(&rsmi_cond);
slurm_mutex_unlock(&rsmi_mutex);
if (thread_rsmi_id_run)
pthread_join(thread_rsmi_id_run, NULL);
xfree(gpus);
xfree(start_current_energies);
FREE_NULL_BITMAP(saved_usable_gpus);
rsmi_shut_down();
return SLURM_SUCCESS;
}
extern int acct_gather_energy_p_update_node_energy(void)
{
int rc = SLURM_SUCCESS;
xassert(running_in_slurmdstepd());
return rc;
}
extern int acct_gather_energy_p_get_data(enum acct_energy_type data_type,
void *data)
{
uint16_t i;
int rc = SLURM_SUCCESS;
acct_gather_energy_t *energy = (acct_gather_energy_t *)data;
time_t *last_poll = (time_t *)data;
uint16_t *gpu_cnt = (uint16_t *)data;
xassert(running_in_slurmdstepd());
switch (data_type) {
case ENERGY_DATA_NODE_ENERGY_UP:
slurm_mutex_lock(&rsmi_mutex);
if (running_in_slurmd()) {
if (_thread_init() == SLURM_SUCCESS) {
_thread_update_node_energy();
_get_node_energy(energy);
}
} else {
_get_joules_task(10);
_get_node_energy_up(energy);
}
slurm_mutex_unlock(&rsmi_mutex);
break;
case ENERGY_DATA_NODE_ENERGY:
slurm_mutex_lock(&rsmi_mutex);
_get_node_energy(energy);
slurm_mutex_unlock(&rsmi_mutex);
break;
case ENERGY_DATA_LAST_POLL:
slurm_mutex_lock(&rsmi_mutex);
*last_poll = gpus[gpus_len-1].last_update_time;
slurm_mutex_unlock(&rsmi_mutex);
break;
case ENERGY_DATA_SENSOR_CNT:
slurm_mutex_lock(&rsmi_mutex);
*gpu_cnt = gpus_len;
slurm_mutex_unlock(&rsmi_mutex);
break;
case ENERGY_DATA_STRUCT:
slurm_mutex_lock(&rsmi_mutex);
for (i = 0; i < gpus_len; i++)
memcpy(&energy[i], &gpus[i].energy,
sizeof(acct_gather_energy_t));
slurm_mutex_unlock(&rsmi_mutex);
break;
case ENERGY_DATA_JOULES_TASK:
slurm_mutex_lock(&rsmi_mutex);
if (running_in_slurmd()) {
if (_thread_init() == SLURM_SUCCESS)
_thread_update_node_energy();
} else {
_get_joules_task(10);
}
for (i = 0; i < gpus_len; ++i)
memcpy(&energy[i], &gpus[i].energy,
sizeof(acct_gather_energy_t));
slurm_mutex_unlock(&rsmi_mutex);
break;
default:
error("%s: unknown enum %d",
__func__, data_type);
rc = SLURM_ERROR;
break;
}
return rc;
}
extern int acct_gather_energy_p_set_data(enum acct_energy_type data_type,
void *data)
{
int rc = SLURM_SUCCESS;
int *delta = (int *)data;
xassert(running_in_slurmdstepd());
switch (data_type) {
case ENERGY_DATA_RECONFIG:
debug_flags = slurm_get_debug_flags();
break;
case ENERGY_DATA_PROFILE:
slurm_mutex_lock(&rsmi_mutex);
_get_joules_task(*delta);
_send_profile();
slurm_mutex_unlock(&rsmi_mutex);
break;
case ENERGY_DATA_STEP_PTR:
{
bitstr_t *usable_gpus = NULL;
/* set global job if needed later */
job = (stepd_step_rec_t *)data;
rc = gres_get_step_info(job->step_gres_list, "gpu", 0,
GRES_STEP_DATA_BITMAP,
&usable_gpus);
if (rc == SLURM_SUCCESS) {
/*
			 * Save a copy of the GPUs usable by this step so the
			 * job's energy can be restricted to them later
*/
FREE_NULL_BITMAP(saved_usable_gpus);
saved_usable_gpus = usable_gpus;
usable_gpus = NULL;
}
if (debug_flags & DEBUG_FLAG_ENERGY)
info("usable_gpus = %d of %ld",
bit_set_count(saved_usable_gpus),
bit_size(saved_usable_gpus));
break;
}
default:
error("%s: unknown enum %d",
__func__, data_type);
rc = SLURM_ERROR;
break;
}
return rc;
}
extern void acct_gather_energy_p_conf_options(s_p_options_t **full_options,
int *full_options_cnt)
{
return;
}
extern void acct_gather_energy_p_conf_set(int context_id_in,
s_p_hashtbl_t *tbl)
{
static bool flag_init = false;
context_id = context_id_in;
if (!running_in_slurmdstepd())
return;
if (!flag_init) {
flag_init = true;
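		/*
		 * slurmd owns the polling thread; slurmstepd only queries
		 * slurmd and sets up its consumption baseline here.
		 */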
if (running_in_slurmd()) {
_rsmi_get_device_count((unsigned int *)&gpus_len);
if (gpus_len) {
gpus = xcalloc(sizeof(gpu_status_t), gpus_len);
slurm_thread_create(&thread_rsmi_id_launcher,
_thread_launcher, NULL);
}
if (debug_flags & DEBUG_FLAG_ENERGY)
info("%s thread launched", plugin_name);
} else
_get_joules_task(0);
}
debug("%s loaded", plugin_name);
return;
}
extern void acct_gather_energy_p_conf_values(List *data)
{
return;
}