/*****************************************************************************\
 *  slurm_route.c - route plugin functions.
 *****************************************************************************
 *  Copyright (C) 2014 Bull S. A. S.
 *		Bull, Rue Jean Jaures, B.P.68, 78340, Les Clayes-sous-Bois.
 *
 *  Written by Rod Schultz
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include "config.h"

#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <sys/param.h>	/* MAXPATHLEN */

#include "slurm/slurm.h"

#include "src/common/log.h"
#include "src/common/forward.h"
#include "src/common/node_conf.h"
#include "src/common/plugrack.h"
#include "src/common/read_config.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/slurm_route.h"
#include "src/common/timers.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

strong_alias(route_split_hostlist_treewidth,
	     slurm_route_split_hostlist_treewidth);

typedef struct slurm_route_ops {
	int (*split_hostlist)    (hostlist_t hl,
				  hostlist_t** sp_hl,
				  int* count, uint16_t tree_width);
	int (*reconfigure)       (void);
	slurm_addr_t* (*next_collector) (bool* is_collector);
	slurm_addr_t* (*next_collector_backup) (void);
} slurm_route_ops_t;

/*
 * Must be synchronized with slurm_route_ops_t above.
 */
static const char *syms[] = {
	"route_p_split_hostlist",
	"route_p_reconfigure",
	"route_p_next_collector",
	"route_p_next_collector_backup"
};

static slurm_route_ops_t ops;
static plugin_context_t *g_context = NULL;
static pthread_mutex_t g_context_lock = PTHREAD_MUTEX_INITIALIZER;
static bool init_run = false;
static uint32_t debug_flags = 0;
static uint16_t g_tree_width;

static bool this_is_collector = false; /* this node is a collector node */
static slurm_addr_t *msg_collect_node = NULL; /* address of node to aggregate
						* messages from this node */
/* addresses of backup nodes to aggregate messages from this node */
static uint32_t msg_backup_cnt = 0;
static slurm_addr_t **msg_collect_backup = NULL;

/*
 * _get_all_nodes creates a hostlist containing all the nodes in the
 * node_record_table.
 *
 * Caller must destroy the list.
 */
static hostlist_t _get_all_nodes(void)
{
	int i;
	hostlist_t nodes = hostlist_create(NULL);

	for (i = 0; i < node_record_count; i++) {
		hostlist_push_host(nodes, node_record_table_ptr[i].name);
	}
	return nodes;
}

/*
 * _set_collectors calls the split_hostlist API on the all-nodes hostlist
 * to set the node to be used as a collector for unsolicited node aggregation.
 *
 * If this node is a forwarding node (first node in any hostlist),
 * then its collector and backup are the ControlMachine and its backup.
 *
 * Otherwise, we find the hostlist containing this node.
 * The forwarding node in that hostlist becomes the collector, and the next
 * node which is not this node becomes the backup.
 * That list is then split; we iterate through it, searching for a list in
 * which this node is a forwarding node. If found, we set the collector and
 * backup; otherwise this process is repeated.
 */
static void _set_collectors(char *this_node_name)
{
	slurm_ctl_conf_t *conf;
	hostlist_t nodes;
	hostlist_t *hll = NULL;
	uint32_t backup_cnt;
	char *parent = NULL, **backup;
	char addrbuf[32];
	int i, j, f = -1;
	int hl_count = 0;
	uint16_t parent_port;
	uint16_t backup_port;
	bool ctldparent = true;
	char *tmp = NULL;

#ifdef HAVE_FRONT_END
	return; /* on a FrontEnd system this would never be useful. */
#endif

	if (!running_in_slurmd())
		return; /* Only compute nodes have collectors */

	/*
	 * Set the initial iteration, collector is controller,
	 * full list is split
	 */
	xassert(this_node_name);

	conf = slurm_conf_lock();
	nodes = _get_all_nodes();
	backup_cnt = conf->control_cnt;
	backup = xmalloc(sizeof(char *) * backup_cnt);
	if (conf->slurmctld_addr) {
		parent = strdup(conf->slurmctld_addr);
		backup_cnt = 1;
	} else
		parent = strdup(conf->control_addr[0]);
	for (i = 0; i < backup_cnt; i++) {
		if (conf->control_addr[i])
			backup[i] = xstrdup(conf->control_addr[i]);
		else
			backup[i] = NULL;
	}
	msg_backup_cnt = backup_cnt + 2;
	msg_collect_backup = xmalloc(sizeof(slurm_addr_t *) * msg_backup_cnt);
	parent_port = conf->slurmctld_port;
	backup_port = parent_port;
	slurm_conf_unlock();

	while (1) {
		if (route_g_split_hostlist(nodes, &hll, &hl_count, 0)) {
			error("unable to split forward hostlist");
			goto clean; /* collector addrs remain NULL */
		}
		/* Find which hostlist contains this node */
		for (i = 0; i < hl_count; i++) {
			f = hostlist_find(hll[i], this_node_name);
			if (f != -1)
				break;
		}
		if (i == hl_count) {
			fatal("ROUTE -- %s not found in node_record_table",
			      this_node_name);
		}
		if (f == 0) {
			/*
			 * We are a forwarded-to node,
			 * so our parent is "parent"
			 */
			if (hostlist_count(hll[i]) > 1)
				this_is_collector = true;
			xfree(msg_collect_node);
			msg_collect_node = xmalloc(sizeof(slurm_addr_t));
			if (ctldparent) {
				slurm_set_addr(msg_collect_node, parent_port,
					       parent);
			} else {
				slurm_conf_get_addr(parent, msg_collect_node,
						    0);
				msg_collect_node->sin_port =
					htons(parent_port);
			}
			if (debug_flags & DEBUG_FLAG_ROUTE) {
				slurm_print_slurm_addr(msg_collect_node,
						       addrbuf, 32);
				info("ROUTE -- message collector (%s) address is %s",
				     parent, addrbuf);
			}

			msg_backup_cnt = 0;
			xfree(msg_collect_backup[0]);
			for (i = 1; (i < backup_cnt) && backup[i]; i++) {
				msg_backup_cnt = i;
				msg_collect_backup[i-1] =
					xmalloc(sizeof(slurm_addr_t));
				if (ctldparent) {
					slurm_set_addr(msg_collect_backup[i-1],
						       backup_port, backup[i]);
				} else {
					slurm_conf_get_addr(
						backup[i],
						msg_collect_backup[i-1], 0);
					msg_collect_backup[i-1]->sin_port =
						htons(backup_port);
				}
				if (debug_flags & DEBUG_FLAG_ROUTE) {
					slurm_print_slurm_addr(
						msg_collect_backup[i-1],
						addrbuf, 32);
					info("ROUTE -- message collector backup[%d] (%s) "
					     "address is %s",
					     i, backup[i], addrbuf);
				}
			}
			if ((i == 1) && (debug_flags & DEBUG_FLAG_ROUTE))
				info("ROUTE -- no message collector backup");
			goto clean;
		}

		/*
		 * We are not a forwarding node, the first node in this list
		 * will split the forward_list.
		 * We also know that the forwarding node is not a controller.
		 *
		 * Clean up parent context.
		 */
		ctldparent = false;
		hostlist_destroy(nodes);
		nodes = hostlist_copy(hll[i]);
		for (j = 0; j < hl_count; j++) {
			hostlist_destroy(hll[j]);
		}
		xfree(hll);

		/* Set our parent, backup, and continue the search */
		for (i = 0; i < backup_cnt; i++)
			xfree(backup[i]);
		if (parent)
			free(parent);
		parent = hostlist_shift(nodes);
		tmp = hostlist_nth(nodes, 0);
		backup[0] = xstrdup(tmp);
		free(tmp);
		tmp = NULL;
		if (xstrcmp(backup[0], this_node_name) == 0) {
			xfree(backup[0]);
			if (hostlist_count(nodes) > 1) {
				tmp = hostlist_nth(nodes, 1);
				backup[0] = xstrdup(tmp);
				free(tmp);
				tmp = NULL;
			}
		}
		parent_port = slurm_conf_get_port(parent);
		if (backup[0])
			backup_port = slurm_conf_get_port(backup[0]);
		else
			backup_port = 0;
	}
clean:
	if (debug_flags & DEBUG_FLAG_ROUTE) {
		slurm_print_slurm_addr(msg_collect_node, addrbuf, 32);
		xstrfmtcat(tmp, "ROUTE -- %s is a %s node (parent:%s",
			   this_node_name,
			   this_is_collector ? "collector" : "leaf", addrbuf);
		for (i = 0; (i < backup_cnt) && msg_collect_backup[i]; i++) {
			slurm_print_slurm_addr(msg_collect_backup[i],
					       addrbuf, 32);
			xstrfmtcat(tmp, " backup[%d]:%s", i, addrbuf);
		}
		info("%s)", tmp);
		xfree(tmp);
	}
	hostlist_destroy(nodes);
	if (parent)
		free(parent);
	for (i = 0; i < backup_cnt; i++)
		xfree(backup[i]);
	xfree(backup);
	for (i = 0; i < hl_count; i++) {
		hostlist_destroy(hll[i]);
	}
	xfree(hll);
}

extern int route_init(char *node_name)
{
	int retval = SLURM_SUCCESS;
	char *plugin_type = "route";
	char *type = NULL;

	if (init_run && g_context)
		return retval;

	slurm_mutex_lock(&g_context_lock);

	if (g_context)
		goto done;

	type = slurm_get_route_plugin();
	g_context = plugin_context_create(plugin_type, type,
					  (void **)&ops, syms, sizeof(syms));

	if (!g_context) {
		error("cannot create %s context for %s", plugin_type, type);
		retval = SLURM_ERROR;
		goto done;
	}

	g_tree_width = slurm_get_tree_width();
	debug_flags = slurm_get_debug_flags();

	init_run = true;
	_set_collectors(node_name);

done:
	slurm_mutex_unlock(&g_context_lock);
	xfree(type);
	return retval;
}

extern int route_fini(void)
{
	int i, rc;

	if (!g_context)
		return SLURM_SUCCESS;

	init_run = false;
	rc = plugin_context_destroy(g_context);
	g_context = NULL;

	xfree(msg_collect_node);
	for (i = 0; i < msg_backup_cnt; i++)
		xfree(msg_collect_backup[i]);
	xfree(msg_collect_backup);
	msg_backup_cnt = 0;

	return rc;
}

/*
 * route_g_split_hostlist - logic to split an input hostlist into
 *                          a set of hostlists to forward to.
 *
 * IN: hl        - hostlist_t   - list of every node to send message to
 *                                will be empty on return which is same
 *                                behavior as similar code replaced in
 *                                forward.c
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: created hostlists will have to be freed independently using
 *       hostlist_destroy() by the caller.
 * Note: the hostlist_t array will have to be xfree()'d by the caller.
 */
extern int route_g_split_hostlist(hostlist_t hl, hostlist_t** sp_hl,
				  int* count, uint16_t tree_width)
{
	int rc;
	int j, nnodes, nnodex;
	char *buf;

	nnodes = nnodex = 0;
	if (route_init(NULL) != SLURM_SUCCESS)
		return SLURM_ERROR;

	if (debug_flags & DEBUG_FLAG_ROUTE) {
		/* nnodes has to be set here as the hl is empty after the
		 * split_hostlist call. */
		nnodes = hostlist_count(hl);
		buf = hostlist_ranged_string_xmalloc(hl);
		info("ROUTE: split_hostlist: hl=%s tree_width %u",
		     buf, tree_width);
		xfree(buf);
	}

	rc = (*(ops.split_hostlist))(hl, sp_hl, count,
				     tree_width ? tree_width : g_tree_width);

	if (debug_flags & DEBUG_FLAG_ROUTE) {
		/* Sanity check to make sure all nodes in msg list are in
		 * a child list */
		nnodex = 0;
		for (j = 0; j < *count; j++) {
			nnodex += hostlist_count((*sp_hl)[j]);
		}
		if (nnodex != nnodes) {	/* CLANG false positive */
			info("ROUTE: number of nodes in split lists (%d)"
			     " is not equal to number in input list (%d)",
			     nnodex, nnodes);
		}
	}
	return rc;
}

/*
 * route_g_reconfigure - reset during reconfigure
 *
 * RET: SLURM_SUCCESS - int
 */
extern int route_g_reconfigure(void)
{
	if (route_init(NULL) != SLURM_SUCCESS)
		return SLURM_ERROR;

	debug_flags = slurm_get_debug_flags();
	g_tree_width = slurm_get_tree_width();

	return (*(ops.reconfigure))();
}

/*
 * route_g_next_collector - return address of next collector
 *
 * IN: is_collector - bool* - flag indicating whether this node is a collector
 *
 * RET: slurm_addr_t* - address of node to send messages to be aggregated.
 */
extern slurm_addr_t* route_g_next_collector(bool *is_collector)
{
	if (route_init(NULL) != SLURM_SUCCESS)
		return NULL;

	return (*(ops.next_collector))(is_collector);
}

/*
 * route_g_next_collector_backup
 *
 * RET: slurm_addr_t* - address of backup node to send messages to be
 *                      aggregated.
 */
extern slurm_addr_t* route_g_next_collector_backup(void)
{
	if (route_init(NULL) != SLURM_SUCCESS)
		return NULL;

	return (*(ops.next_collector_backup))();
}

/*
 * route_split_hostlist_treewidth - logic to split an input hostlist into
 *                                  a set of hostlists to forward to.
 *
 * This is the default behavior. It is implemented here as there are cases
 * where the topology version also needs to split the message list based
 * on TreeWidth.
 *
 * IN: hl        - hostlist_t   - list of every node to send message to
 *                                will be empty on return which is same
 *                                behavior as similar code replaced in
 *                                forward.c
 * OUT: sp_hl    - hostlist_t** - the array of hostlists that will be malloced
 * OUT: count    - int*         - the count of created hostlists
 * RET: SLURM_SUCCESS - int
 *
 * Note: created hostlists will have to be freed independently using
 *       hostlist_destroy() by the caller.
 * Note: the hostlist_t array will have to be xfree()'d by the caller.
 */
extern int route_split_hostlist_treewidth(hostlist_t hl, hostlist_t** sp_hl,
					  int* count, uint16_t tree_width)
{
	int host_count;
	int *span = NULL;
	char *name = NULL;
	char *buf;
	int nhl = 0;
	int j;

	if (!tree_width)
		tree_width = g_tree_width;

	host_count = hostlist_count(hl);
	span = set_span(host_count, tree_width);
	*sp_hl = xmalloc(tree_width * sizeof(hostlist_t));

	while ((name = hostlist_shift(hl))) {
		(*sp_hl)[nhl] = hostlist_create(name);
		free(name);
		for (j = 0; j < span[nhl]; j++) {
			name = hostlist_shift(hl);
			if (!name) {
				break;
			}
			hostlist_push_host((*sp_hl)[nhl], name);
			free(name);
		}
		if (debug_flags & DEBUG_FLAG_ROUTE) {
			buf = hostlist_ranged_string_xmalloc((*sp_hl)[nhl]);
			debug("ROUTE: ... sublist[%d] %s", nhl, buf);
			xfree(buf);
		}
		nhl++;
	}
	xfree(span);
	*count = nhl;

	return SLURM_SUCCESS;
}

/*
 * route_next_collector - get collector node address
 *
 * IN: is_collector - bool* - flag indicating whether this node is a collector
 *
 * RET: slurm_addr_t* - address of node to send messages to be aggregated.
 */
extern slurm_addr_t* route_next_collector(bool *is_collector)
{
	*is_collector = this_is_collector;
	return msg_collect_node;
}

/*
 * route_next_collector_backup - get collector backup address based on offset
 *
 * backup_inx IN - Backup server index (between 1 and msg_backup_cnt-1)
 * RET: slurm_addr_t* - address of backup node to send messages to be
 *                      aggregated
 */
extern slurm_addr_t* route_next_collector_backup(int backup_inx)
{
	if ((backup_inx <= 0) || (backup_inx >= msg_backup_cnt))
		return NULL;
	return msg_collect_backup[backup_inx];
}
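
/*
 * Usage sketch (illustrative only, not part of this file): a caller that
 * wants to fan a message out to a set of nodes would typically drive
 * route_g_split_hostlist() roughly as below.  The node names are
 * hypothetical; the cleanup pattern follows the ownership notes in the
 * function's header comment above.
 *
 *	hostlist_t hl = hostlist_create("node[001-100]");
 *	hostlist_t *sp_hl = NULL;
 *	int count = 0, i;
 *
 *	// tree_width of 0 means "use the configured TreeWidth"
 *	if (route_g_split_hostlist(hl, &sp_hl, &count, 0) == SLURM_SUCCESS) {
 *		for (i = 0; i < count; i++) {
 *			// send to the first host of sp_hl[i]; that host
 *			// forwards the message to the rest of its sublist
 *			hostlist_destroy(sp_hl[i]);
 *		}
 *		xfree(sp_hl);
 *	}
 *	hostlist_destroy(hl);	// hl is empty after the split
 */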
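
/*
 * Plugin-side sketch (illustrative only, under stated assumptions): a route
 * plugin has to export the four symbols listed in syms[] above.  A plugin
 * wanting the default TreeWidth behavior can delegate to
 * route_split_hostlist_treewidth() and to the route_next_collector*()
 * helpers defined in this file.  The boilerplate identifiers below
 * (plugin_name, plugin_type, plugin_version) follow the usual Slurm plugin
 * conventions, and returning backup index 1 is only an example choice.
 *
 *	const char plugin_name[] = "route default plugin";
 *	const char plugin_type[] = "route/default";
 *	const uint32_t plugin_version = SLURM_VERSION_NUMBER;
 *
 *	extern int route_p_split_hostlist(hostlist_t hl, hostlist_t **sp_hl,
 *					  int *count, uint16_t tree_width)
 *	{
 *		return route_split_hostlist_treewidth(hl, sp_hl, count,
 *						      tree_width);
 *	}
 *
 *	extern int route_p_reconfigure(void)
 *	{
 *		return SLURM_SUCCESS;
 *	}
 *
 *	extern slurm_addr_t *route_p_next_collector(bool *is_collector)
 *	{
 *		return route_next_collector(is_collector);
 *	}
 *
 *	extern slurm_addr_t *route_p_next_collector_backup(void)
 *	{
 *		return route_next_collector_backup(1);
 *	}
 */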