/*****************************************************************************\
 *  multi_prog.c - executing program according to task rank
 *                 set MPIR_PROCDESC accordingly
 *
 *  NOTE: The logic could be eliminated if slurmstepd kept track of the
 *  executable name for each task and returned that information in a new
 *  launch response message (with multiple executable names).
 *****************************************************************************
 *  Produced at National University of Defense Technology (China)
 *  Written by Hongjia Cao
 *  and
 *  Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette .
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see .
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
\*****************************************************************************/

#include "config.h"

/*
 * NOTE(review): the header names of the seven directives below are missing
 * from this copy of the file. They are kept exactly as found and must be
 * restored from version control before the file can be preprocessed.
 */
#include
#include
#include
#include
#include
#include
#include

#include "src/common/bitstring.h"
#include "src/common/log.h"
#include "src/common/xassert.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"
#include "src/common/proc_args.h"

#include "debugger.h"
#include "multi_prog.h"
#include "opt.h"

/*
 * Record exec_name as the executable of every task in [low_num, high_num].
 *
 * A task whose executable_name is already set keeps its current value; in
 * that case an error is logged unless ignore_duplicates is true.
 * No bounds checking is done here: the caller must pass indices that are
 * valid for MPIR_proctable.
 */
static void _set_range(int low_num, int high_num, char *exec_name,
		       bool ignore_duplicates)
{
	int i;

	for (i = low_num; i <= high_num; i++) {
		MPIR_PROCDESC *tv;
		tv = &MPIR_proctable[i];
		if (tv->executable_name == NULL) {
			tv->executable_name = xstrdup(exec_name);
		} else if (!ignore_duplicates) {
			error("duplicate configuration for task %d ignored", i);
		}
	}
}

/*
 * Apply exec_name to the tasks selected by the rank specification "ranks".
 * A specification of exactly "*" selects every task (0 .. ntasks - 1) and
 * never reports duplicates.
 */
static void _set_exec_names(char *ranks, char *exec_name, int ntasks)
{
	char *ptrptr = NULL;
	int low_num, high_num, num, i;

	if ((ranks[0] == '*') && (ranks[1] == '\0')) {
		low_num = 0;
		high_num = ntasks - 1;
		_set_range(low_num, high_num, exec_name, true);
		return;
	}

	ptrptr = ranks;
	/*
	 * NOTE(review): this copy of the file is damaged on the next line.
	 * The text between "for (i=0; i" and "executable_name = NULL;" is
	 * missing: that is the rest of this function and the opening (name,
	 * signature, local declarations) of the function whose body
	 * continues below. The fragment is reproduced verbatim; restore the
	 * missing text from version control rather than reconstructing it.
	 */
	for (i=0; iexecutable_name = NULL; }

	/*
	 * Read the configuration file line by line and hand each
	 * "ranks executable" pair to _set_exec_names().
	 */
	config_fd = fopen(config_fname, "r");
	if (config_fd == NULL) {
		error("Unable to open configuration file %s", config_fname);
		return -1;
	}
	while (fgets(line, sizeof(line), config_fd)) {
		line_num ++;
		line_len = strlen(line);
		/* A completely filled buffer is treated as an overlong line */
		if (line_len >= (sizeof(line) - 1)) {
			error ("Line %d of configuration file %s too long",
			       line_num, config_fname);
			fclose(config_fd);
			return -1;
		}
		/* A trailing backslash continues the record on the next line */
		if ((line_len > 0 && line[line_len - 1] == '\\') || /* EOF */
		    (line_len > 1 && line[line_len - 2] == '\\' &&
		     line[line_len - 1] == '\n'))
			line_break = true;
		else
			line_break = false;
		/* Continuation lines only carry arguments: skip them */
		if (last_line_break) {
			last_line_break = line_break;
			continue;
		}
		last_line_break = line_break;
		p = line;
		/* remove leading spaces; cast keeps <ctype.h> well defined */
		while (*p != '\0' && isspace ((unsigned char) *p))
			p ++;
		if (*p == '#') /* only whole-line comments handled */
			continue;
		if (*p == '\0') /* blank line ignored */
			continue;
		ranks = strtok_r(p, " \t\n", &ptrptr);
		exec_name = strtok_r(NULL, " \t\n", &ptrptr);
		if (!ranks || !exec_name) {
			error("Line %d of configuration file %s is invalid",
			      line_num, config_fname);
			fclose(config_fd);
			return -1;
		}
		_set_exec_names(ranks, exec_name, ntasks);
	}
	fclose(config_fd);
	return 0;
}

/*
 * Allocate MPIR_proctable with room for num_tasks entries and record the
 * entry count in MPIR_proctable_size.
 * Logs an error and exits with error_exit if the allocation returns NULL.
 */
extern void mpir_init(int num_tasks)
{
	MPIR_proctable_size = num_tasks;
	MPIR_proctable = xmalloc(sizeof(MPIR_PROCDESC) * num_tasks);
	if (MPIR_proctable == NULL) {
		error("Unable to initialize MPIR_proctable: %m");
		exit(error_exit);
	}
}

/*
 * Release the host and executable names of every entry, then the table
 * itself. The size is reset so that later loops over MPIR_proctable_size
 * do not index the released table.
 */
extern void mpir_cleanup(void)
{
	int i;

	for (i = 0; i < MPIR_proctable_size; i++) {
		xfree(MPIR_proctable[i].host_name);
		xfree(MPIR_proctable[i].executable_name);
	}
	xfree(MPIR_proctable);
	MPIR_proctable_size = 0;
}

/*
 * Set the executable name of task_count consecutive tasks starting at
 * task_offset to a private copy of executable_name.
 *
 * IN executable_name - name to copy into each selected entry
 * IN task_offset - first task to update; NO_VAL is treated as 0
 * IN task_count - number of tasks to update
 */
extern void mpir_set_executable_names(const char *executable_name,
				      uint32_t task_offset,
				      uint32_t task_count)
{
	uint32_t i;

	if (task_offset == NO_VAL)
		task_offset = 0;
	xassert((task_offset + task_count) <= MPIR_proctable_size);
	for (i = task_offset; i < (task_offset + task_count); i++) {
		char *name_copy = xstrdup(executable_name);

		/*
		 * Release any name recorded earlier instead of leaking it.
		 * Like mpir_cleanup(), this relies on unset entries being
		 * NULL.
		 */
		xfree(MPIR_proctable[i].executable_name);
		MPIR_proctable[i].executable_name = name_copy;
	}
}

/*
 * Log one info() line per MPIR_proctable entry (task index, host, pid and
 * executable name).
 */
extern void mpir_dump_proctable(void)
{
	MPIR_PROCDESC *tv;
	int i;

	for (i = 0; i < MPIR_proctable_size; i++) {
		tv = &MPIR_proctable[i];
		info("task:%d, host:%s, pid:%d, executable:%s",
		     i, tv->host_name, tv->pid, tv->executable_name);
	}
}

/*
 * Mark tasks low_num..high_num as configured in *task_mask.
 *
 * If high_num is beyond the current opt_local->ntasks, the task count is
 * raised to high_num + 1 and the bitmap is grown, unless the task count was
 * set by someone other than this function, in which case it is an error.
 * (The "set by this function" state is kept in a function-local static, so
 * it persists across calls.)
 *
 * IN low_num, high_num - inclusive task id range
 * IN/OUT opt_local - ntasks / ntasks_set may be updated
 * IN/OUT task_mask - bitmap of configured tasks, may be reallocated
 * IN ignore_duplicates - if true, already-marked tasks are silently skipped
 * RET 0 on success, -1 on an invalid range or duplicate record
 */
static int _update_task_mask(int low_num, int high_num, slurm_opt_t *opt_local,
			     bitstr_t **task_mask, bool ignore_duplicates)
{
	int i;

	if (low_num > high_num) {
		error("Invalid task range, %d-%d", low_num, high_num);
		return -1;
	}
	if (low_num < 0) {
		error("Invalid task id, %d < 0", low_num);
		return -1;
	}
	if (high_num >= opt_local->ntasks) {
		static bool i_set_ntasks = false;
		if (opt_local->ntasks_set && !i_set_ntasks) {
			error("Invalid task id, %d >= ntasks", high_num);
			return -1;
		} else {
			opt_local->ntasks = high_num + 1;
			opt_local->ntasks_set = true;
			i_set_ntasks = true;
			(*task_mask) = bit_realloc((*task_mask),
						   opt_local->ntasks);
		}
	}
	for (i = low_num; i <= high_num; i++) {
		if (bit_test((*task_mask), i)) {
			if (ignore_duplicates)
				continue;
			error("Duplicate record for task %d", i);
			return -1;
		}
		bit_set((*task_mask), i);
	}
	return 0;
}

/*
 * Validate one rank specification from the configuration file and mark the
 * tasks it names in *task_mask.
 *
 * Accepted forms are "*" (all tasks) or a comma separated list of "N" and
 * "N-M" items. Every item must start with a digit, and the upper bound of
 * a range must contain at least one digit, so "-", "-5" and "5-" are
 * rejected as malformed instead of being read as task 0 or as a reversed
 * range. Once a "*" line has been seen (remembered in a function-local
 * static), any later list specification is rejected.
 *
 * Each accepted "*" line and each list item seen increments
 * opt_local->srun_opt->multi_prog_cmds.
 *
 * IN ranks - specification, modified in place by strtok_r()
 * IN/OUT opt_local - slurm options
 * IN/OUT task_mask - bitmap of configured tasks
 * RET 0 on success, -1 otherwise
 */
static int _validate_ranks(char *ranks, slurm_opt_t *opt_local,
			   bitstr_t **task_mask)
{
	static bool has_asterisk = false;
	char *range = NULL, *p = NULL;
	char *ptrptr = NULL, *upper = NULL;
	int low_num, high_num;

	if (ranks[0] == '*' && ranks[1] == '\0') {
		low_num = 0;
		high_num = opt_local->ntasks - 1;
		opt_local->ntasks_set = true;	/* do not allow to change later */
		has_asterisk = true;	/* must be last MPMD spec line */
		opt_local->srun_opt->multi_prog_cmds++;
		return _update_task_mask(low_num, high_num, opt_local,
					 task_mask, true);
	}

	for (range = strtok_r(ranks, ",", &ptrptr); range != NULL;
	     range = strtok_r(NULL, ",", &ptrptr)) {
		/*
		 * Non-contiguous tasks are split into multiple commands
		 * in the mpmd_set so count each token separately
		 */
		opt_local->srun_opt->multi_prog_cmds++;

		if (has_asterisk) {
			error("Task range specification with asterisk must "
			      "be last");
			return -1;
		}

		/* The cast keeps <ctype.h> defined for negative char values */
		p = range;
		while (isdigit((unsigned char) *p))
			p++;

		if (p == range) {
			/* No leading digit at all ("-", "-5", "x", ...) */
			error ("Invalid task range specification (%s)", range);
			return -1;
		} else if (*p == '\0') {	/* single rank */
			low_num = atoi(range);
			high_num = low_num;
		} else if (*p == '-') {		/* lower-upper */
			upper = ++p;
			while (isdigit((unsigned char) *p))
				p++;
			/* Need at least one digit and nothing after them */
			if ((p == upper) || (*p != '\0')) {
				error ("Invalid task range specification");
				return -1;
			}
			low_num = atoi(range);
			high_num = atoi(upper);
		} else {
			error ("Invalid task range specification (%s)", range);
			return -1;
		}

		if (_update_task_mask(low_num, high_num, opt_local,
				      task_mask, false))
			return -1;
	}
	return 0;
}

/*
 * Verify that we have a valid executable program specified for each task
 * when the --multi-prog option is used.
* IN config_name - MPMD configuration file name * IN/OUT opt_local - slurm options * RET 0 on success, -1 otherwise */ extern int verify_multi_name(char *config_fname, slurm_opt_t *opt_local) { FILE *config_fd; char line[BUF_SIZE]; char *ranks, *exec_name, *p, *ptrptr, *fullpath = NULL; int line_num = 0, i, rc = 0; bool last_line_break = false, line_break = false; int line_len; bitstr_t *task_mask; if (opt_local->ntasks <= 0) { error("Invalid task count %d", opt_local->ntasks); return -1; } opt_local->srun_opt->multi_prog_cmds = 0; config_fd = fopen(config_fname, "r"); if (config_fd == NULL) { error("Unable to open configuration file %s", config_fname); return -1; } task_mask = bit_alloc(opt_local->ntasks); while (fgets(line, sizeof(line), config_fd)) { line_num++; line_len = strlen(line); if (line_len >= (sizeof(line) - 1)) { error ("Line %d of configuration file %s too long", line_num, config_fname); rc = -1; goto fini; } if ((line_len > 0 && line[line_len - 1] == '\\') || /* EOF */ (line_len > 1 && line[line_len - 2] == '\\' && line[line_len - 1] == '\n')) line_break = true; else line_break = false; if (last_line_break) { last_line_break = line_break; continue; } last_line_break = line_break; p = line; while (*p != '\0' && isspace (*p)) /* remove leading spaces */ p ++; if (*p == '#') /* only whole-line comments handled */ continue; if (*p == '\0') /* blank line ignored */ continue; ranks = strtok_r(p, " \t\n", &ptrptr); exec_name = strtok_r(NULL, " \t\n", &ptrptr); if (!ranks || !exec_name) { error("Line %d of configuration file %s invalid", line_num, config_fname); rc = -1; goto fini; } if (_validate_ranks(ranks, opt_local, &task_mask)) { error("Line %d of configuration file %s invalid", line_num, config_fname); rc = -1; goto fini; } if (opt_local->srun_opt->test_exec && !(fullpath = search_path( opt_local->chdir, exec_name, true, X_OK, true))) { error("Line %d of configuration file %s, program %s not executable", line_num, config_fname, exec_name); rc = -1; 
goto fini; } xfree(fullpath); } for (i = 0; i < opt_local->ntasks; i++) { if (!bit_test(task_mask, i)) { error("Configuration file %s invalid, " "no record for task id %d", config_fname, i); rc = -1; goto fini; } } fini: fclose(config_fd); FREE_NULL_BITMAP(task_mask); return rc; }