/*****************************************************************************\ * gpu_rsmi.c - Support rsmi interface to an AMD GPU. ***************************************************************************** * Copyright (C) 2019 SchedMD LLC * Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved. * Written by Advanced Micro Devices, * who borrowed heavily from SLURM gpu and nvml plugin. * * This file is part of Slurm, a resource management program. * For details, see . * Please also read the included file: DISCLAIMER. * * Slurm is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * In addition, as a special exception, the copyright holders give permission * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than * OpenSSL. If you modify file(s) with this exception, you may extend this * exception to your version of the file(s), but you are not obligated to do * so. If you do not wish to do so, delete this exception statement from your * version. If you delete this exception statement from all source files in * the program, then also delete it here. * * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along * with Slurm; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #define _GNU_SOURCE #include "src/common/slurm_xlator.h" #include "src/common/gres.h" #include "src/common/log.h" #include /* * #defines needed to test rsmi. */ #define FREQS_CONCISE 5 // This must never be smaller than 5, or error #define GPU_LOW ((unsigned int) -1) #define GPU_MEDIUM ((unsigned int) -2) #define GPU_HIGH_M1 ((unsigned int) -3) #define GPU_HIGH ((unsigned int) -4) static bitstr_t *saved_gpus; /* * Buffer size large enough for RSMI string */ #define RSMI_STRING_BUFFER_SIZE 80 /* * PCI information about a GPU device. */ typedef struct rsmiPciInfo_st { union { struct { #ifdef SLURM_BIGENDIAN uint64_t reserved : 35; uint64_t domain : 16; uint64_t bus : 5; uint64_t device : 5; uint64_t function : 3; #else uint64_t function : 3; uint64_t device : 5; uint64_t bus : 5; uint64_t domain : 16; uint64_t reserved : 35; #endif }; uint64_t bdfid; }; } rsmiPciInfo_t; /* * These variables are required by the generic plugin interface. If they * are not found in the plugin, the plugin loader will ignore it. * * plugin_name - A string giving a human-readable description of the * plugin. There is no maximum length, but the symbol must refer to * a valid string. * * plugin_type - A string suggesting the type of the plugin or its * applicability to a particular form of data or method of data handling. * If the low-level plugin API is used, the contents of this string are * unimportant and may be anything. Slurm uses the higher-level plugin * interface which requires this string to be of the form * * / * * where is a description of the intended application of * the plugin (e.g., "auth" for Slurm authentication) and is a * description of how this plugin satisfies that application. Slurm will * only load authentication plugins if the plugin_type string has a prefix * of "auth/". * * plugin_version - an unsigned 32-bit integer containing the Slurm version * (major.minor.micro combined into a single number). */ const char *plugin_name = "GPU RSMI plugin"; const char *plugin_type = "gpu/rsmi"; const uint32_t plugin_version = SLURM_VERSION_NUMBER; static log_level_t log_lvl = LOG_LEVEL_DEBUG5; extern int init(void) { debug("%s: %s loaded", __func__, plugin_name); if (slurm_get_debug_flags() & DEBUG_FLAG_GRES) log_lvl = LOG_LEVEL_INFO; return SLURM_SUCCESS; } extern int fini(void) { debug("%s: unloading %s", __func__, plugin_name); return SLURM_SUCCESS; } //TODO: Duplicated from NVML plugin. Move to a common directory static unsigned int _xlate_freq_value(char *gpu_freq) { unsigned int value; if (!gpu_freq && (gpu_freq[0] < '0') && (gpu_freq[0] > '9')) return 0; /* Not a numeric value */ value = strtoul(gpu_freq, NULL, 10); return value; } //TODO: Duplicated from NVML plugin. Move to a common directory static unsigned int _xlate_freq_code(char *gpu_freq) { //TODO: To be moved to common directory if (!gpu_freq || !gpu_freq[0]) return 0; if ((gpu_freq[0] >= '0') && (gpu_freq[0] <= '9')) return 0; /* Pure numeric value */ if (!strcasecmp(gpu_freq, "low")) return GPU_LOW; else if (!strcasecmp(gpu_freq, "medium")) return GPU_MEDIUM; else if (!strcasecmp(gpu_freq, "highm1")) return GPU_HIGH_M1; else if (!strcasecmp(gpu_freq, "high")) return GPU_HIGH; debug("%s: %s: Invalid job GPU frequency (%s)", plugin_type, __func__, gpu_freq); return 0; /* Bad user input */ } //TODO: Duplicated from NVML plugin. Move to a common directory static void _parse_gpu_freq2(char *gpu_freq, unsigned int *gpu_freq_code, unsigned int *gpu_freq_value, unsigned int *mem_freq_code, unsigned int *mem_freq_value, bool *verbose_flag) { char *tmp, *tok, *sep, *save_ptr = NULL; if (!gpu_freq || !gpu_freq[0]) return; tmp = xstrdup(gpu_freq); tok = strtok_r(tmp, ",", &save_ptr); while (tok) { sep = strchr(tok, '='); if (sep) { sep[0] = '\0'; sep++; if (!strcasecmp(tok, "memory")) { *mem_freq_code = _xlate_freq_code(sep); *mem_freq_value = _xlate_freq_value(sep); if (!(*mem_freq_code) && !(*mem_freq_value)) { debug("Invalid job GPU memory frequency: %s", tok); } } else { debug("%s: %s: Invalid job device frequency type: %s", plugin_type, __func__, tok); } } else if (!strcasecmp(tok, "verbose")) { *verbose_flag = true; } else { *gpu_freq_code = _xlate_freq_code(tok); *gpu_freq_value = _xlate_freq_value(tok); if (!(*gpu_freq_code) && !(*gpu_freq_value)) debug("Invalid job GPU frequency: %s", tok); } tok = strtok_r(NULL, ",", &save_ptr); } xfree(tmp); } //TODO: Duplicated from NVML plugin. Move to a common directory static void _parse_gpu_freq(char *gpu_freq, unsigned int *gpu_freq_num, unsigned int *mem_freq_num, bool *verbose_flag) { unsigned int def_gpu_freq_code = 0, def_gpu_freq_value = 0; unsigned int def_mem_freq_code = 0, def_mem_freq_value = 0; unsigned int job_gpu_freq_code = 0, job_gpu_freq_value = 0; unsigned int job_mem_freq_code = 0, job_mem_freq_value = 0; char *def_freq; _parse_gpu_freq2(gpu_freq, &job_gpu_freq_code, &job_gpu_freq_value, &job_mem_freq_code, &job_mem_freq_value, verbose_flag); // Defaults to high for both mem and gfx def_freq = slurm_get_gpu_freq_def(); _parse_gpu_freq2(def_freq, &def_gpu_freq_code, &def_gpu_freq_value, &def_mem_freq_code, &def_mem_freq_value, verbose_flag); xfree(def_freq); if (job_gpu_freq_code) *gpu_freq_num = job_gpu_freq_code; else if (job_gpu_freq_value) *gpu_freq_num = job_gpu_freq_value; else if (def_gpu_freq_code) *gpu_freq_num = def_gpu_freq_code; else if (def_gpu_freq_value) *gpu_freq_num = def_gpu_freq_value; if (job_mem_freq_code) *mem_freq_num = job_mem_freq_code; else if (job_mem_freq_value) *mem_freq_num = job_mem_freq_value; else if (def_mem_freq_code) *mem_freq_num = def_mem_freq_code; else if (def_mem_freq_value) *mem_freq_num = def_mem_freq_value; } //TODO: Duplicated from NVML plugin. Move to a common directory static int _sort_freq_descending(const void *a, const void *b) { return (*(unsigned long *)b - *(unsigned long *)a); } /* * Get all possible memory frequencies for the device * * dv_ind (IN) The device index * mem_freqs_size (IN/OUT) The size of the mem_freqs array; this will be * overwritten with the number of memory freqs found. * mem_freqs (OUT) The possible memory frequencies in MHz. * * Return true if successful, false if not. */ static bool _rsmi_get_mem_freqs(uint32_t dv_ind, unsigned int *mem_freqs_size, unsigned int *mem_freqs) { const char *status_string; rsmi_status_t rsmi_rc; rsmi_frequencies_t rsmi_freqs; DEF_TIMERS; START_TIMER; rsmi_rc = rsmi_dev_gpu_clk_freq_get( dv_ind, RSMI_CLK_TYPE_MEM, &rsmi_freqs); END_TIMER; debug3("rsmi_dev_gpu_clk_freq_get() took %ld microseconds", DELTA_TIMER); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get memory frequencies error: %s", status_string); return false; } *mem_freqs_size = rsmi_freqs.num_supported; for (int i = 0; i < *mem_freqs_size; i++) mem_freqs[i] = rsmi_freqs.frequency[i]/1000000; return true; } /* * Get all possible graphics frequencies for the device * * dv_ind (IN) The device index * gfx_freqs_size (IN/OUT) The size of the gfx_freqs array; this will * be overwritten with the number of graphics freqs found. * gfx_freqs (OUT) The possible graphics frequencies in MHz. * * Return true if successful, false if not. */ static bool _rsmi_get_gfx_freqs(uint32_t dv_ind, unsigned int *gfx_freqs_size, unsigned int *gfx_freqs) { const char *status_string; rsmi_status_t rsmi_rc; rsmi_frequencies_t rsmi_freqs; DEF_TIMERS; START_TIMER; rsmi_rc = rsmi_dev_gpu_clk_freq_get( dv_ind, RSMI_CLK_TYPE_SYS, &rsmi_freqs); END_TIMER; debug3("rsmi_dev_gpu_clk_freq_get() took %ld microseconds", DELTA_TIMER); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get graphics frequencies error: %s", status_string); return false; } *gfx_freqs_size = rsmi_freqs.num_supported; for (int i = 0; i < *gfx_freqs_size; i++) gfx_freqs[i] = rsmi_freqs.frequency[i]/1000000; return true; } /* * Print out all possible memory and graphics frequencies for the given device. * If there are more than FREQS_CONCISE frequencies, prints a summary instead * * dv_ind (IN) The device index * l (IN) The log level at which to print */ static void _rsmi_print_freqs(uint32_t dv_ind, log_level_t l) { unsigned int mem_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; unsigned int gfx_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; unsigned int size = RSMI_MAX_NUM_FREQUENCIES; bool concise = false; unsigned int i; if (!_rsmi_get_mem_freqs(dv_ind, &size, mem_freqs)) return; qsort(mem_freqs, size, sizeof(unsigned int), _sort_freq_descending); if ((size > 1) && (mem_freqs[0] <= mem_freqs[(size)-1])) { error("%s: memory frequencies are not stored in descending order!", __func__); return; } if (size > FREQS_CONCISE) concise = true; log_var(l, " Possible GPU Memory Frequencies (%u):", size); log_var(l, " ---------------------------------"); if (!concise) { for (i = 0; i < size; ++i) log_var(l, " *%u MHz [%u]", mem_freqs[i], i); } else { // first, next, ..., middle, ..., penultimate, last log_var(l, " *%u MHz [0]", mem_freqs[0]); log_var(l, " *%u MHz [1]", mem_freqs[1]); log_var(l, " ..."); log_var(l, " *%u MHz [%u]", mem_freqs[(size - 1) / 2], (size - 1) / 2); log_var(l, " ..."); log_var(l, " *%u MHz [%u]", mem_freqs[size - 2], size - 2); log_var(l, " *%u MHz [%u]", mem_freqs[size - 1], size - 1); } size = RSMI_MAX_NUM_FREQUENCIES; if (!_rsmi_get_gfx_freqs(dv_ind, &size, gfx_freqs)) return; qsort(gfx_freqs, size, sizeof(unsigned int), _sort_freq_descending); if ((size > 1) && (gfx_freqs[0] <= gfx_freqs[(size)-1])) { error("%s: Graphics frequencies are not stored in descending order!", __func__); return; } if (size > FREQS_CONCISE) concise = true; log_var(l, " Possible GPU Graphics Frequencies (%u):", size); log_var(l, " ---------------------------------"); if (!concise) { for (i = 0; i < size; ++i) log_var(l, " *%u MHz [%u]", gfx_freqs[i], i); return; } // first, next, ..., middle, ..., penultimate, last log_var(l, " *%u MHz [0]", gfx_freqs[0]); log_var(l, " *%u MHz [1]", gfx_freqs[1]); log_var(l, " ..."); log_var(l, " *%u MHz [%u]", gfx_freqs[(size - 1) / 2], (size - 1) / 2); log_var(l, " ..."); log_var(l, " *%u MHz [%u]", gfx_freqs[size - 2], size - 2); log_var(l, " *%u MHz [%u]", gfx_freqs[size - 1], size - 1); } /* * Convert frequency to nearest valid frequency found in frequency array * * freq (IN/OUT) The frequency to check, in MHz. Also the output, if * it needs to be changed. * freqs_size (IN) The size of the freqs array * freqs (IN) An array of frequency values in MHz, sorted highest to * lowest * * Inspired by src/common/cpu_frequency#_cpu_freq_freqspec_num() */ //TODO: Duplicated from NVML plugin. Move to a common directory static void _get_nearest_freq(unsigned int *freq, unsigned int freqs_size, unsigned int *freqs) { unsigned int i; if (!freq || !(*freq)) { log_var(log_lvl, "%s: No frequency supplied", __func__); return; } if (!freqs || !(*freqs)) { log_var(log_lvl, "%s: No frequency list supplied", __func__); return; } if (freqs_size <= 0) { log_var(log_lvl, "%s: Frequency list is empty", __func__); return; } // Check for special case values; freqs is sorted in descending order switch ((*freq)) { case GPU_LOW: *freq = freqs[freqs_size - 1]; debug2("Frequency GPU_LOW: %u MHz", *freq); return; case GPU_MEDIUM: *freq = freqs[(freqs_size - 1) / 2]; debug2("Frequency GPU_MEDIUM: %u MHz", *freq); return; case GPU_HIGH_M1: if (freqs_size == 1) *freq = freqs[0]; else *freq = freqs[1]; debug2("Frequency GPU_HIGH_M1: %u MHz", *freq); return; case GPU_HIGH: *freq = freqs[0]; debug2("Frequency GPU_HIGH: %u MHz", *freq); return; default: debug2("Freq is not a special case. Continue..."); break; } /* check if freq is out of bounds of freqs */ if (*freq > freqs[0]) { log_var(log_lvl, "Rounding frequency %u MHz down to %u MHz", *freq, freqs[0]); *freq = freqs[0]; return; } else if (*freq < freqs[freqs_size - 1]) { log_var(log_lvl, "Rounding frequency %u MHz up to %u MHz", *freq, freqs[freqs_size - 1]); *freq = freqs[freqs_size - 1]; return; } /* check for frequency, and round up if no exact match */ for (i = 0; i < freqs_size - 1;) { if (*freq == freqs[i]) // No change necessary debug2("No change necessary. Freq: %u MHz", *freq); return; i++; /* * Step down to next element to round up. * Safe to advance due to bounds checks above here */ if (*freq > freqs[i]) { log_var(log_lvl, "Rounding frequency %u MHz up to %u MHz", *freq, freqs[i - 1]); *freq = freqs[i - 1]; return; } } error("%s: Got to the end of the function. Freq: %u MHz", __func__, *freq); } /* * Get the nearest valid memory and graphics frequencies * Return bit masks indicating the indices of the * frequencies that are to be enabled (1) and disabled (0). * * dv_ind (IN) the device index * mem_freq (IN/OUT) requested/nearest valid memory frequency * mem_bitmask (OUT) bit mask for the nearest valid memory frequency * gfx_freq (IN/OUT) requested/nearest valid graphics frequency * gfx_bitmask (OUT) bit mask for the nearest valid graphics frequency */ static void _rsmi_get_nearest_freqs(uint32_t dv_ind, unsigned int *mem_freq, uint64_t *mem_bitmask, unsigned int *gfx_freq, uint64_t *gfx_bitmask) { unsigned int mem_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; unsigned int mem_freqs_sort[RSMI_MAX_NUM_FREQUENCIES] = {0}; unsigned int mem_freqs_size = RSMI_MAX_NUM_FREQUENCIES; unsigned int gfx_freqs[RSMI_MAX_NUM_FREQUENCIES] = {0}; unsigned int gfx_freqs_sort[RSMI_MAX_NUM_FREQUENCIES] = {0}; unsigned int gfx_freqs_size = RSMI_MAX_NUM_FREQUENCIES; // Get the memory frequencies if (!_rsmi_get_mem_freqs(dv_ind, &mem_freqs_size, mem_freqs)) return; memcpy(mem_freqs_sort, mem_freqs, mem_freqs_size*sizeof(unsigned int)); qsort(mem_freqs_sort, mem_freqs_size, sizeof(unsigned int), _sort_freq_descending); if ((mem_freqs_size > 1) && (mem_freqs_sort[0] <= mem_freqs_sort[(mem_freqs_size)-1])) { error("%s: memory frequencies are not stored in descending order!", __func__); return; } // Set the nearest valid memory frequency for the requested frequency _get_nearest_freq(mem_freq, mem_freqs_size, mem_freqs_sort); // convert the frequency to bit mask for (int i = 0; i < mem_freqs_size; i++) if (*mem_freq == mem_freqs[i]) { *mem_bitmask = (1 << i); break; } // Get the graphics frequencies if (!_rsmi_get_gfx_freqs(dv_ind, &gfx_freqs_size, gfx_freqs)) return; memcpy(gfx_freqs_sort, gfx_freqs, gfx_freqs_size*sizeof(unsigned int)); qsort(gfx_freqs_sort, gfx_freqs_size, sizeof(unsigned int), _sort_freq_descending); if ((gfx_freqs_size > 1) && (gfx_freqs_sort[0] <= gfx_freqs_sort[(gfx_freqs_size)-1])) { error("%s: graphics frequencies are not stored in descending order!", __func__); return; } // Set the nearest valid graphics frequency for the requested frequency _get_nearest_freq(gfx_freq, gfx_freqs_size, gfx_freqs_sort); // convert the frequency to bit mask for (int i = 0; i < gfx_freqs_size; i++) if (*gfx_freq == gfx_freqs[i]) { *gfx_bitmask = (1 << i); break; } } /* * Set the memory and graphics clock frequencies for the GPU * * dv_ind (IN) The device index * mem_bitmask (IN) bit mask for the memory frequency. * gfx_bitmask (IN) bit mask for the graphics frequency. * * Returns true if successful, false if not */ static bool _rsmi_set_freqs(uint32_t dv_ind, uint64_t mem_bitmask, uint64_t gfx_bitmask) { const char *status_string; rsmi_status_t rsmi_rc; DEF_TIMERS; START_TIMER; rsmi_rc = rsmi_dev_gpu_clk_freq_set( dv_ind, RSMI_CLK_TYPE_MEM, mem_bitmask); END_TIMER; debug3("rsmi_dev_gpu_clk_freq_set(0x%lx) for memory took %ld microseconds", mem_bitmask, DELTA_TIMER); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to set memory frequency GPU %u error: %s", dv_ind, status_string); return false; } START_TIMER; rsmi_rc = rsmi_dev_gpu_clk_freq_set(dv_ind, RSMI_CLK_TYPE_SYS, gfx_bitmask); debug3("rsmi_dev_gpu_clk_freq_set(0x%lx) for graphics took %ld microseconds", gfx_bitmask, DELTA_TIMER); END_TIMER; if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to set graphic frequency GPU %u error: %s", dv_ind, status_string); return false; } return true; } /* * Reset the memory and graphics clock frequencies for the GPU to the same * default frequencies that are used after system reboot or driver reload. This * default cannot be changed. * * dv_ind (IN) The device index * * Returns true if successful, false if not */ static bool _rsmi_reset_freqs(uint32_t dv_ind) { const char *status_string; rsmi_status_t rsmi_rc; DEF_TIMERS; START_TIMER; rsmi_rc = rsmi_dev_perf_level_set(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO); END_TIMER; debug3("rsmi_dev_perf_level_set() took %ld microseconds", DELTA_TIMER); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to reset frequencies error: %s", status_string); return false; } return true; } /* * Get the memory or graphics clock frequency that the GPU is currently running * at * * dv_ind (IN) The device index * type (IN) The clock type to query. Either RSMI_CLK_TYPE_SYS or * RSMI_CLK_TYPE_MEM. * * Returns the clock frequency in MHz if successful, or 0 if not */ static unsigned int _rsmi_get_freq(uint32_t dv_ind, rsmi_clk_type_t type) { const char *status_string; rsmi_status_t rsmi_rc; rsmi_frequencies_t rsmi_freqs; char *type_str = "unknown"; DEF_TIMERS; switch (type) { case RSMI_CLK_TYPE_SYS: type_str = "graphics"; break; case RSMI_CLK_TYPE_MEM: type_str = "memory"; break; default: error("%s: Unsupported clock type", __func__); break; } START_TIMER; rsmi_rc = rsmi_dev_gpu_clk_freq_get(dv_ind, type, &rsmi_freqs); END_TIMER; debug3("rsmi_dev_gpu_clk_freq_get(%s) took %ld microseconds", type_str, DELTA_TIMER); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get the GPU frequency type %s, error: %s", type_str, status_string); return 0; } return (rsmi_freqs.frequency[rsmi_freqs.current]/1000000); } static unsigned int _rsmi_get_gfx_freq(uint32_t dv_ind) { return _rsmi_get_freq(dv_ind, RSMI_CLK_TYPE_SYS); } static unsigned int _rsmi_get_mem_freq(uint32_t dv_ind) { return _rsmi_get_freq(dv_ind, RSMI_CLK_TYPE_MEM); } /* * Convert a frequency value to a string * Returned string must be xfree()'ed */ //TODO: Duplicated from NVML plugin. Move to a common directory static char *_freq_value_to_string(unsigned int freq) { switch (freq) { case GPU_LOW: return xstrdup("low"); case GPU_MEDIUM: return xstrdup("medium"); case GPU_HIGH: return xstrdup("high"); case GPU_HIGH_M1: return xstrdup("highm1"); default: return xstrdup_printf("%u", freq); } } /* * Reset the frequencies of each GPU in the step to the hardware default * NOTE: RSMI must be initialized beforehand * * gpus (IN) A bitmap specifying the GPUs on which to operate. */ static void _reset_freq(bitstr_t *gpus) { int gpu_len = bit_size(gpus); int i = -1, count = 0, count_set = 0; bool freq_reset = false; // Reset the frequency of each device allocated to the step for (i = 0; i < gpu_len; i++) { if (!bit_test(gpus, i)) continue; count++; debug2("Memory frequency before reset: %u", _rsmi_get_mem_freq(i)); debug2("Graphics frequency before reset: %u", _rsmi_get_gfx_freq(i)); freq_reset = _rsmi_reset_freqs(i); debug2("Memory frequency after reset: %u", _rsmi_get_mem_freq(i)); debug2("Graphics frequency after reset: %u", _rsmi_get_gfx_freq(i)); // TODO: Check to make sure that the frequency reset if (freq_reset) { log_var(log_lvl, "Successfully reset GPU[%d]", i); count_set++; } else { log_var(log_lvl, "Failed to reset GPU[%d]", i); } } if (count_set != count) { log_var(log_lvl, "%s: Could not reset frequencies for all GPUs %d/%d total GPUs", __func__, count_set, count); fprintf(stderr, "Could not reset frequencies for all GPUs %d/%d total GPUs\n", count_set, count); } } /* * Set the frequencies of each GPU specified for the step * NOTE: RSMI must be initialized beforehand * * gpus (IN) A bitmap specifying the GPUs on which to operate. * gpu_freq (IN) The frequencies to set each of the GPUs to. If a NULL or * empty memory or graphics frequency is specified, then GpuFreqDef * will be consulted, which defaults to "high,memory=high" if not * set. */ static void _set_freq(bitstr_t *gpus, char *gpu_freq) { bool verbose_flag = false; int gpu_len = 0; int i = -1, count = 0, count_set = 0; unsigned int gpu_freq_num = 0, mem_freq_num = 0; uint64_t mem_bitmask = 0, gpu_bitmask = 0; bool freq_set = false, freq_logged = false; char *tmp = NULL; slurm_cgroup_conf_t *cg_conf; bool task_cgroup = false; bool constrained_devices = false; bool cgroups_active = false; char *task_plugin_type = NULL; // Parse frequency information debug2("_parse_gpu_freq(%s)", gpu_freq); _parse_gpu_freq(gpu_freq, &gpu_freq_num, &mem_freq_num, &verbose_flag); if (verbose_flag) debug2("verbose_flag ON"); tmp = _freq_value_to_string(mem_freq_num); debug2("Requested GPU memory frequency: %s", tmp); xfree(tmp); tmp = _freq_value_to_string(gpu_freq_num); debug2("Requested GPU graphics frequency: %s", tmp); xfree(tmp); if (!mem_freq_num || !gpu_freq_num) { debug2("%s: No frequencies to set", __func__); return; } // Check if GPUs are constrained by cgroups slurm_mutex_lock(&xcgroup_config_read_mutex); cg_conf = xcgroup_get_slurm_cgroup_conf(); if (cg_conf && cg_conf->constrain_devices) constrained_devices = true; slurm_mutex_unlock(&xcgroup_config_read_mutex); // Check if task/cgroup plugin is loaded task_plugin_type = slurm_get_task_plugin(); if (strstr(task_plugin_type, "cgroup")) task_cgroup = true; xfree(task_plugin_type); // If both of these are true, then GPUs will be constrained if (constrained_devices && task_cgroup) { cgroups_active = true; gpu_len = bit_set_count(gpus); debug2("%s: cgroups are configured. Using LOCAL GPU IDs", __func__); } else { gpu_len = bit_size(gpus); debug2("%s: cgroups are NOT configured. Assuming GLOBAL GPU IDs", __func__); } // Set the frequency of each device allocated to the step for (i = 0; i < gpu_len; i++) { char *sep = ""; // Only check the global GPU bitstring if not using cgroups if (!cgroups_active && !bit_test(gpus, i)) { debug2("Passing over RSMI device %u", i); continue; } count++; debug2("Setting frequency of RSMI device %u", i); _rsmi_get_nearest_freqs(i, &mem_freq_num, &mem_bitmask, &gpu_freq_num, &gpu_bitmask); debug2("Memory frequency before set: %u", _rsmi_get_mem_freq(i)); debug2("Graphics frequency before set: %u", _rsmi_get_gfx_freq(i)); freq_set = _rsmi_set_freqs(i, mem_bitmask, gpu_bitmask); debug2("Memory frequency after set: %u", _rsmi_get_mem_freq(i)); debug2("Graphics frequency after set: %u", _rsmi_get_gfx_freq(i)); if (mem_freq_num) { xstrfmtcat(tmp, "%smemory_freq:%u", sep, mem_freq_num); sep = ","; } if (gpu_freq_num) { xstrfmtcat(tmp, "%sgraphics_freq:%u", sep, gpu_freq_num); } if (freq_set) { log_var(log_lvl, "Successfully set GPU[%d] %s", i, tmp); count_set++; } else { log_var(log_lvl, "Failed to set GPU[%d] %s", i, tmp); } if (verbose_flag && !freq_logged) { fprintf(stderr, "GpuFreq=%s\n", tmp); freq_logged = true; /* Just log for first GPU */ } xfree(tmp); } if (count_set != count) { log_var(log_lvl, "%s: Could not set frequencies for all GPUs %d/%d total GPUs", __func__, count_set, count); fprintf(stderr, "Could not set frequencies for all GPUs %d/%d total GPUs\n", count_set, count); } } /* * Get the version of the AMD Graphics driver * * driver (OUT) A string to return version of AMD GPU driver * len (OUT) Length for version of AMD GPU driver */ static void _rsmi_get_driver(char *driver, unsigned int len) { rsmi_version_str_get(RSMI_SW_COMP_DRIVER, driver, len); } /* * Get the version of the ROCM-SMI library * * version (OUT) A string to return version of RSMI * len (OUT) Length for version of RSMI */ static void _rsmi_get_version(char *version, unsigned int len) { const char *status_string; rsmi_version_t rsmi_version; rsmi_status_t rsmi_rc = rsmi_version_get(&rsmi_version); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get the version error: %s", status_string); version[0] = '\0'; } else sprintf(version, "%s", rsmi_version.build); } /* * Get the total # of GPUs in the system * * device_count (OUT) Number of available GPU devices */ static void _rsmi_get_device_count(unsigned int *device_count) { const char *status_string; rsmi_status_t rsmi_rc = rsmi_num_monitor_devices(device_count); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get device count: %s", status_string); *device_count = 0; } } /* * Get the name of the GPU * * dv_ind (IN) The device index * device_name (OUT) Name of GPU devices * size (OUT) Size of name */ static void _rsmi_get_device_name(uint32_t dv_ind, char *device_name, unsigned int size) { const char *status_string; rsmi_status_t rsmi_rc = rsmi_dev_name_get(dv_ind, device_name, size); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get name of the GPU: %s", status_string); } } /* * Get the brand of the GPU * * dv_ind (IN) The device index * device_brand (OUT) Brand of GPU devices * size (OUT) Size of name */ static void _rsmi_get_device_brand(uint32_t dv_ind, char *device_brand, unsigned int size) { const char *status_string; rsmi_status_t rsmi_rc = rsmi_dev_brand_get(dv_ind, device_brand, size); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get brand of the GPU: %s", status_string); } } /* * Retrieves minor number of the render device. Each AMD GPU will have a device node file * in form /dev/dri/renderD[minor_number]. * * dv_ind (IN) The device index * minor (OUT) minor number of device node */ static void _rsmi_get_device_minor_number(uint32_t dv_ind, unsigned int *minor) { const char *status_string; rsmi_status_t rsmi_rc = rsmi_dev_drm_render_minor_get(dv_ind, minor); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get minor number of GPU: %s", status_string); } } /* * Get the PCI Info of the GPU * * dv_ind (IN) The device index * pci (OUT) PCI Info of GPU devices */ static void _rsmi_get_device_pci_info(uint32_t dv_ind, rsmiPciInfo_t *pci) { const char *status_string; rsmi_status_t rsmi_rc = rsmi_dev_pci_id_get(dv_ind, &(pci->bdfid)); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get PCI Info of the GPU: %s", status_string); } } /* * Get the Unique ID of the GPU * * dv_ind (IN) The device index * id (OUT) Unique ID of GPU devices */ static void _rsmi_get_device_unique_id(uint32_t dv_ind, uint64_t *id) { const char *status_string; rsmi_status_t rsmi_rc = rsmi_dev_unique_id_get(dv_ind, id); if (rsmi_rc != RSMI_STATUS_SUCCESS) { rsmi_rc = rsmi_status_string(rsmi_rc, &status_string); error("RSMI: Failed to get Unique ID of the GPU: %s", status_string); } } /* * Creates and returns a gres conf list of detected AMD gpus on the node. * If an error occurs, return NULL * Caller is responsible for freeing the list. * * If the AMD ROCM-SMI API exists, then query GPU info, * so the user doesn't need to specify manually in gres.conf. * * node_config (IN/OUT) pointer of node_config_load_t passed down */ static List _get_system_gpu_list_rsmi(node_config_load_t *node_config) { unsigned int i; unsigned int device_count = 0; List gres_list_system = list_create(destroy_gres_slurmd_conf); char driver[RSMI_STRING_BUFFER_SIZE]; char version[RSMI_STRING_BUFFER_SIZE]; rsmi_init(0); _rsmi_get_driver(driver, RSMI_STRING_BUFFER_SIZE); _rsmi_get_version(version, RSMI_STRING_BUFFER_SIZE); debug("AMD Graphics Driver Version: %s", driver); debug("RSMI Library Version: %s", version); _rsmi_get_device_count(&device_count); debug2("Device count: %d", device_count); // Loop through all the GPUs on the system and add to gres_list_system for (i = 0; i < device_count; ++i) { unsigned int minor_number = 0; char *device_file = NULL; char device_name[RSMI_STRING_BUFFER_SIZE] = {0}; char device_brand[RSMI_STRING_BUFFER_SIZE] = {0}; rsmiPciInfo_t pci_info; uint64_t uuid = 0; _rsmi_get_device_name(i, device_name, RSMI_STRING_BUFFER_SIZE); _rsmi_get_device_brand(i, device_brand, RSMI_STRING_BUFFER_SIZE); _rsmi_get_device_minor_number(i, &minor_number); pci_info.bdfid = 0; _rsmi_get_device_pci_info(i, &pci_info); _rsmi_get_device_unique_id(i, &uuid); xstrfmtcat(device_file, "/dev/dri/renderD%u", minor_number); debug2("GPU index %u:", i); debug2(" Name: %s", device_name); debug2(" Brand/Type: %s", device_brand); debug2(" UUID: %lx", uuid); debug2(" PCI Domain/Bus/Device/Function: %u:%u:%u.%u", pci_info.domain, pci_info.bus, pci_info.device, pci_info.function); debug2(" Device File (minor number): %s", device_file); if (minor_number != i+128) debug("Note: GPU index %u is different from minor # %u", i, minor_number); // Print out possible memory frequencies for this device _rsmi_print_freqs(i, LOG_LEVEL_DEBUG2); add_gres_to_list(gres_list_system, "gpu", 1, node_config->cpu_cnt, NULL, NULL, device_file, device_brand, NULL); xfree(device_file); } rsmi_shut_down(); info("%u GPU system device(s) detected", device_count); return gres_list_system; } extern int gpu_p_reconfig(void) { if (slurm_get_debug_flags() & DEBUG_FLAG_GRES) log_lvl = LOG_LEVEL_INFO; else log_lvl = LOG_LEVEL_DEBUG5; return SLURM_SUCCESS; } extern List gpu_p_get_system_gpu_list(node_config_load_t *node_config) { List gres_list_system = _get_system_gpu_list_rsmi(node_config); if (!gres_list_system) error("System GPU detection failed"); return gres_list_system; } extern void gpu_p_step_hardware_init(bitstr_t *usable_gpus, char *tres_freq) { char *freq = NULL; char *tmp = NULL; xassert(tres_freq); xassert(usable_gpus); if (!usable_gpus) return; /* Job allocated no GPUs */ if (!tres_freq) return; /* No TRES frequency spec */ tmp = strstr(tres_freq, "gpu:"); if (!tmp) return; /* No GPU frequency spec */ freq = xstrdup(tmp + 4); tmp = strchr(freq, ';'); if (tmp) tmp[0] = '\0'; // Save a copy of the GPUs affected, so we can reset things afterwards FREE_NULL_BITMAP(saved_gpus); saved_gpus = bit_copy(usable_gpus); rsmi_init(0); // Set the frequency of each GPU index specified in the bitstr _set_freq(usable_gpus, freq); xfree(freq); } extern void gpu_p_step_hardware_fini(void) { if (!saved_gpus) return; // Reset the frequencies back to the hardware default _reset_freq(saved_gpus); FREE_NULL_BITMAP(saved_gpus); rsmi_shut_down(); } extern char *gpu_p_test_cpu_conv(char *cpu_range) { return NULL; }