/*****************************************************************************\ * info_job.c - job information functions for scontrol. ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Copyright (C) 2008-2010 Lawrence Livermore National Security. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette * CODE-OCEC-09-009. All rights reserved. * * This file is part of Slurm, a resource management program. * For details, see . * Please also read the included file: DISCLAIMER. * * Slurm is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * In addition, as a special exception, the copyright holders give permission * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than * OpenSSL. If you modify file(s) with this exception, you may extend this * exception to your version of the file(s), but you are not obligated to do * so. If you do not wish to do so, delete this exception statement from your * version. If you delete this exception statement from all source files in * the program, then also delete it here. * * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along * with Slurm; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #include #include #include #include #include #include "scontrol.h" #include "src/common/bitstring.h" #include "src/common/slurm_time.h" #include "src/common/stepd_api.h" #define POLL_SLEEP 3 /* retry interval in seconds */ /* Load current job table information into *job_buffer_pptr */ extern int scontrol_load_job(job_info_msg_t ** job_buffer_pptr, uint32_t job_id) { int error_code; static uint16_t last_show_flags = 0xffff; uint16_t show_flags = 0; job_info_msg_t * job_info_ptr = NULL; if (all_flag) show_flags |= SHOW_ALL; if (detail_flag) show_flags |= SHOW_DETAIL; if (federation_flag) show_flags |= SHOW_FEDERATION; if (local_flag) show_flags |= SHOW_LOCAL; if (sibling_flag) show_flags |= SHOW_FEDERATION | SHOW_SIBLING; if (old_job_info_ptr) { if (last_show_flags != show_flags) old_job_info_ptr->last_update = (time_t) 0; if (job_id) { error_code = slurm_load_job(&job_info_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs( old_job_info_ptr->last_update, &job_info_ptr, show_flags); } if (error_code == SLURM_SUCCESS) slurm_free_job_info_msg (old_job_info_ptr); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { job_info_ptr = old_job_info_ptr; error_code = SLURM_SUCCESS; if (quiet_flag == -1) printf ("slurm_load_jobs no change in data\n"); } } else if (job_id) { error_code = slurm_load_job(&job_info_ptr, job_id, show_flags); } else { error_code = slurm_load_jobs((time_t) NULL, &job_info_ptr, show_flags); } if (error_code == SLURM_SUCCESS) { old_job_info_ptr = job_info_ptr; if (job_id) old_job_info_ptr->last_update = (time_t) 0; last_show_flags = show_flags; *job_buffer_pptr = job_info_ptr; } return error_code; } /* * scontrol_pid_info - given a local process id, print the corresponding * slurm job id and its expected end time * IN job_pid - the local process id of interest */ extern void scontrol_pid_info(pid_t job_pid) { int error_code; uint32_t job_id = 0; time_t end_time; long rem_time; error_code = slurm_pid2jobid(job_pid, &job_id); if (error_code) { exit_code = 1; if (quiet_flag != 1) fprintf(stderr, "Failed to locate job for requested pid\n"); return; } error_code = slurm_get_end_time(job_id, &end_time); if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror("Failed to get job end time"); return; } printf("Slurm JobId=%u ends at %s\n", job_id, slurm_ctime2(&end_time)); rem_time = slurm_get_rem_time(job_id); printf("Job remaining time is %ld seconds\n", rem_time); return; } /* * scontrol_print_completing - print jobs in completing state and * associated nodes in COMPLETING or DOWN state */ extern void scontrol_print_completing (void) { int error_code, i; job_info_msg_t *job_info_msg; job_info_t *job_info; node_info_msg_t *node_info_msg; uint16_t show_flags = 0; error_code = scontrol_load_job (&job_info_msg, 0); if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror ("slurm_load_jobs error"); return; } /* Must load all nodes including hidden for cross-index * from job's node_inx to node table to work */ /*if (all_flag) Always set this flag */ show_flags |= SHOW_ALL; if (federation_flag) show_flags |= SHOW_FEDERATION; if (local_flag) show_flags |= SHOW_LOCAL; error_code = scontrol_load_nodes(&node_info_msg, show_flags); if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror ("slurm_load_nodes error"); return; } /* Scan the jobs for completing state */ job_info = job_info_msg->job_array; for (i = 0; i < job_info_msg->record_count; i++) { if (job_info[i].job_state & JOB_COMPLETING) scontrol_print_completing_job(&job_info[i], node_info_msg); } slurm_free_node_info_msg(node_info_msg); } extern void scontrol_print_completing_job(job_info_t *job_ptr, node_info_msg_t *node_info_msg) { int i, c_offset = 0; node_info_t *node_info; hostlist_t comp_nodes, down_nodes; char *node_buf; char time_str[32]; time_t completing_time = 0; comp_nodes = hostlist_create(NULL); down_nodes = hostlist_create(NULL); if (job_ptr->cluster && federation_flag && !local_flag) c_offset = get_cluster_node_offset(job_ptr->cluster, node_info_msg); for (i = 0; job_ptr->node_inx[i] != -1; i+=2) { int j = job_ptr->node_inx[i]; for (; j <= job_ptr->node_inx[i+1]; j++) { int node_inx = j + c_offset; if (node_inx >= node_info_msg->record_count) break; node_info = &(node_info_msg->node_array[node_inx]); if (IS_NODE_COMPLETING(node_info)) hostlist_push_host(comp_nodes, node_info->name); else if (IS_NODE_DOWN(node_info)) hostlist_push_host(down_nodes, node_info->name); } } fprintf(stdout, "JobId=%u ", job_ptr->job_id); slurm_make_time_str(&job_ptr->end_time, time_str, sizeof(time_str)); fprintf(stdout, "EndTime=%s ", time_str); completing_time = time(NULL) - job_ptr->end_time; secs2time_str(completing_time, time_str, sizeof(time_str)); fprintf(stdout, "CompletingTime=%s ", time_str); node_buf = hostlist_ranged_string_xmalloc(comp_nodes); if (node_buf && node_buf[0]) fprintf(stdout, "Nodes(COMPLETING)=%s ", node_buf); xfree(node_buf); node_buf = hostlist_ranged_string_xmalloc(down_nodes); if (node_buf && node_buf[0]) fprintf(stdout, "Nodes(DOWN)=%s ", node_buf); xfree(node_buf); fprintf(stdout, "\n"); hostlist_destroy(comp_nodes); hostlist_destroy(down_nodes); } extern uint16_t scontrol_get_job_state(uint32_t job_id) { job_info_msg_t * job_buffer_ptr = NULL; int error_code = SLURM_SUCCESS, i; job_info_t *job_ptr = NULL; error_code = scontrol_load_job(&job_buffer_ptr, job_id); if (error_code) { exit_code = 1; if (quiet_flag == -1) slurm_perror ("slurm_load_job error"); return NO_VAL16; } if (quiet_flag == -1) { char time_str[32]; slurm_make_time_str((time_t *)&job_buffer_ptr->last_update, time_str, sizeof(time_str)); printf("last_update_time=%s, records=%d\n", time_str, job_buffer_ptr->record_count); } job_ptr = job_buffer_ptr->job_array ; for (i = 0; i < job_buffer_ptr->record_count; i++) { if (job_ptr->job_id == job_id) return job_ptr->job_state; } if (quiet_flag == -1) printf("Could not find job %u", job_id); return NO_VAL16; } static bool _het_job_offset_match(job_info_t *job_ptr, uint32_t het_job_offset) { if ((het_job_offset == NO_VAL) || (het_job_offset == job_ptr->het_job_offset)) return true; return false; } static bool _task_id_in_job(job_info_t *job_ptr, uint32_t array_id) { bitstr_t *array_bitmap; uint32_t array_len; if ((array_id == NO_VAL) || (array_id == job_ptr->array_task_id)) return true; array_bitmap = (bitstr_t *) job_ptr->array_bitmap; if (array_bitmap == NULL) return false; array_len = bit_size(array_bitmap); if (array_id >= array_len) return false; if (bit_test(array_bitmap, array_id)) return true; return false; } /* * scontrol_print_job - print the specified job's information * IN job_id - job's id or NULL to print information about all jobs */ extern void scontrol_print_job(char * job_id_str) { int error_code = SLURM_SUCCESS, i, print_cnt = 0; uint32_t job_id = 0; uint32_t array_id = NO_VAL, het_job_offset = NO_VAL; job_info_msg_t * job_buffer_ptr = NULL; job_info_t *job_ptr = NULL; char *end_ptr = NULL; if (job_id_str) { char *tmp_job_ptr = job_id_str; /* * Check that the input is a valid job id (i.e. 123 or 123_456). */ while (*tmp_job_ptr) { if (!isdigit(*tmp_job_ptr) && (*tmp_job_ptr != '_') && (*tmp_job_ptr != '+')) { exit_code = 1; slurm_seterrno(ESLURM_INVALID_JOB_ID); if (quiet_flag != 1) slurm_perror("scontrol_print_job error"); return; } ++tmp_job_ptr; } job_id = (uint32_t) strtol (job_id_str, &end_ptr, 10); if (end_ptr[0] == '_') array_id = strtol(end_ptr + 1, &end_ptr, 10); if (end_ptr[0] == '+') het_job_offset = strtol(end_ptr + 1, &end_ptr, 10); } error_code = scontrol_load_job(&job_buffer_ptr, job_id); if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror ("slurm_load_jobs error"); return; } if (quiet_flag == -1) { char time_str[32]; slurm_make_time_str ((time_t *)&job_buffer_ptr->last_update, time_str, sizeof(time_str)); printf ("last_update_time=%s, records=%d\n", time_str, job_buffer_ptr->record_count); } for (i = 0, job_ptr = job_buffer_ptr->job_array; i < job_buffer_ptr->record_count; i++, job_ptr++) { char *save_array_str = NULL; uint32_t save_task_id = 0; if (!_het_job_offset_match(job_ptr, het_job_offset)) continue; if (!_task_id_in_job(job_ptr, array_id)) continue; if ((array_id != NO_VAL) && job_ptr->array_task_str) { save_array_str = job_ptr->array_task_str; job_ptr->array_task_str = NULL; save_task_id = job_ptr->array_task_id; job_ptr->array_task_id = array_id; } slurm_print_job_info(stdout, job_ptr, one_liner); if (save_array_str) { job_ptr->array_task_str = save_array_str; job_ptr->array_task_id = save_task_id; } print_cnt++; } if (print_cnt == 0) { if (job_id_str) { exit_code = 1; if (quiet_flag != 1) { if (array_id != NO_VAL) { printf("Job %u_%u not found\n", job_id, array_id); } else if (het_job_offset != NO_VAL) { printf("Job %u+%u not found\n", job_id, het_job_offset); } else { printf("Job %u not found\n", job_id); } } } else if (quiet_flag != 1) printf ("No jobs in the system\n"); } } /* * scontrol_print_step - print the specified job step's information * IN job_step_id_str - job step's id or NULL to print information * about all job steps */ extern void scontrol_print_step (char *job_step_id_str) { int error_code, i, print_cnt = 0; slurm_step_id_t step_id = { .job_id = 0, .step_het_comp = NO_VAL, .step_id = NO_VAL, }; uint32_t array_id = NO_VAL; char *next_str; job_step_info_response_msg_t *job_step_info_ptr; job_step_info_t * job_step_ptr; static uint32_t last_job_id = 0, last_array_id, last_step_id = 0; static job_step_info_response_msg_t *old_job_step_info_ptr = NULL; static uint16_t last_show_flags = 0xffff; uint16_t show_flags = 0; if (job_step_id_str) { step_id.job_id = (uint32_t)strtol(job_step_id_str, &next_str, 10); if (next_str[0] == '_') array_id = (uint32_t) strtol(next_str+1, &next_str, 10); else if (next_str[0] == '.') step_id.step_id = (uint32_t)strtol(next_str + 1, NULL, 10); } if (all_flag) show_flags |= SHOW_ALL; if (local_flag) show_flags |= SHOW_LOCAL; if ((old_job_step_info_ptr) && (last_job_id == step_id.job_id) && (last_array_id == array_id) && (last_step_id == step_id.step_id)) { if (last_show_flags != show_flags) old_job_step_info_ptr->last_update = (time_t) 0; error_code = slurm_get_job_steps( old_job_step_info_ptr->last_update, step_id.job_id, step_id.step_id, &job_step_info_ptr, show_flags); if (error_code == SLURM_SUCCESS) slurm_free_job_step_info_response_msg ( old_job_step_info_ptr); else if (slurm_get_errno () == SLURM_NO_CHANGE_IN_DATA) { job_step_info_ptr = old_job_step_info_ptr; error_code = SLURM_SUCCESS; if (quiet_flag == -1) printf("slurm_get_job_steps no change in data\n"); } } else { if (old_job_step_info_ptr) { slurm_free_job_step_info_response_msg ( old_job_step_info_ptr); old_job_step_info_ptr = NULL; } error_code = slurm_get_job_steps ( (time_t) 0, step_id.job_id, step_id.step_id, &job_step_info_ptr, show_flags); } if (error_code) { exit_code = 1; if (quiet_flag != 1) slurm_perror ("slurm_get_job_steps error"); return; } old_job_step_info_ptr = job_step_info_ptr; last_show_flags = show_flags; last_job_id = step_id.job_id; last_step_id = step_id.step_id; if (quiet_flag == -1) { char time_str[32]; slurm_make_time_str ((time_t *)&job_step_info_ptr->last_update, time_str, sizeof(time_str)); printf ("last_update_time=%s, records=%d\n", time_str, job_step_info_ptr->job_step_count); } for (i = 0, job_step_ptr = job_step_info_ptr->job_steps; i < job_step_info_ptr->job_step_count; i++, job_step_ptr++) { if ((array_id != NO_VAL) && (array_id != job_step_ptr->array_task_id)) continue; slurm_print_job_step_info(stdout, job_step_ptr, one_liner); print_cnt++; } if (print_cnt == 0) { if (job_step_id_str) { exit_code = 1; if (quiet_flag != 1) { char tmp_char[45]; log_build_step_id_str(&step_id, tmp_char, sizeof(tmp_char), (STEP_ID_FLAG_NO_PREFIX | STEP_ID_FLAG_NO_JOB)); if (array_id == NO_VAL) { printf("Job step %u.%s not found\n", step_id.job_id, tmp_char); } else { printf("Job step %u_%u.%s not found\n", step_id.job_id, array_id, tmp_char); } } } else if (quiet_flag != 1) printf ("No job steps in the system\n"); } } /* Return 1 on success, 0 on failure to find a jobid in the string */ static int _parse_jobid(const char *jobid_str, uint32_t *out_jobid) { char *ptr, *job; long jobid; job = xstrdup(jobid_str); ptr = xstrchr(job, '.'); if (ptr != NULL) { *ptr = '\0'; } jobid = strtol(job, &ptr, 10); if (!xstring_is_whitespace(ptr)) { fprintf(stderr, "\"%s\" does not look like a jobid\n", job); xfree(job); return 0; } *out_jobid = (uint32_t) jobid; xfree(job); return 1; } /* Return 1 on success, 0 on failure to find a stepid in the string */ static int _parse_stepid(const char *jobid_str, slurm_step_id_t *step_id) { char *ptr, *job, *step; int rc = 1; job = xstrdup(jobid_str); ptr = xstrchr(job, '.'); if (ptr == NULL) { /* did not find a period, so no step ID in this string */ xfree(job); return rc; } else { step = ptr + 1; } step_id->step_id = (uint32_t)strtol(step, &ptr, 10); step = xstrchr(ptr, '+'); if (step) { /* het step */ step++; step_id->step_het_comp = (uint32_t)strtol(step, &ptr, 10); } else step_id->step_het_comp = NO_VAL; if (!xstring_is_whitespace(ptr)) { fprintf(stderr, "\"%s\" does not look like a stepid\n", jobid_str); rc = 0; } xfree(job); return rc; } static bool _in_task_array(pid_t pid, slurmstepd_task_info_t *task_array, uint32_t task_array_count) { int i; for (i = 0; i < task_array_count; i++) { if (pid == task_array[i].pid) return true; } return false; } static void _list_pids_one_step(const char *node_name, slurm_step_id_t *step_id) { int fd; slurmstepd_task_info_t *task_info = NULL; uint32_t *pids = NULL; uint32_t count = 0; uint32_t tcount = 0; int i; uint16_t protocol_version; char tmp_char[64]; fd = stepd_connect(NULL, node_name, step_id, &protocol_version); if (fd == -1) { exit_code = 1; if (errno == ENOENT) { fprintf(stderr, "%s does not exist on this node.\n", log_build_step_id_str(step_id, tmp_char, sizeof(tmp_char), STEP_ID_FLAG_NONE)); exit_code = 1; } else { perror("Unable to connect to slurmstepd"); } return; } log_build_step_id_str(step_id, tmp_char, sizeof(tmp_char), STEP_ID_FLAG_NO_JOB | STEP_ID_FLAG_NO_PREFIX); stepd_task_info(fd, protocol_version, &task_info, &tcount); for (i = 0; i < (int)tcount; i++) { if (task_info[i].exited) continue; printf("%-8d %-8u %-8s %-7d %-8d\n", task_info[i].pid, step_id->job_id, tmp_char, task_info[i].id, task_info[i].gtid); } stepd_list_pids(fd, protocol_version, &pids, &count); for (i = 0; i < count; i++) { if (!_in_task_array((pid_t)pids[i], task_info, tcount)) { printf("%-8d %-8u %-8s %-7s %-8s\n", pids[i], step_id->job_id, tmp_char, "-", "-"); } } xfree(pids); xfree(task_info); close(fd); } static void _list_pids_all_steps(const char *node_name, slurm_step_id_t *step_id) { List steps; ListIterator itr; step_loc_t *stepd; int count = 0; char tmp_char[64]; if (step_id->step_het_comp != NO_VAL) { _list_pids_one_step(node_name, step_id); return; } steps = stepd_available(NULL, node_name); if (!steps || list_count(steps) == 0) { fprintf(stderr, "%s does not exist on node %s.\n", log_build_step_id_str(step_id, tmp_char, sizeof(tmp_char), STEP_ID_FLAG_NONE), node_name); FREE_NULL_LIST(steps); exit_code = 1; return; } itr = list_iterator_create(steps); while ((stepd = list_next(itr))) { if (step_id->job_id != stepd->step_id.job_id) continue; if ((step_id->step_id != NO_VAL) && (step_id->step_id != stepd->step_id.step_id)) continue; _list_pids_one_step(stepd->nodename, &stepd->step_id); count++; } list_iterator_destroy(itr); FREE_NULL_LIST(steps); if (count == 0) { if (step_id->step_id != NO_VAL) { fprintf(stderr, "%s does not exist on node %s.\n", log_build_step_id_str(step_id, tmp_char, sizeof(tmp_char), STEP_ID_FLAG_NONE), node_name); } else fprintf(stderr, "There are no steps for job %u on node %s.\n", step_id->job_id, node_name); exit_code = 1; } } static void _list_pids_all_jobs(const char *node_name) { List steps; ListIterator itr; step_loc_t *stepd; steps = stepd_available(NULL, node_name); if (!steps || list_count(steps) == 0) { fprintf(stderr, "No job steps exist on this node.\n"); FREE_NULL_LIST(steps); exit_code = 1; return; } itr = list_iterator_create(steps); while((stepd = list_next(itr))) { _list_pids_one_step(stepd->nodename, &stepd->step_id); } list_iterator_destroy(itr); FREE_NULL_LIST(steps); } /* * scontrol_list_pids - given a slurmd job ID or job ID + step ID, * print the process IDs of the processes each job step (or * just the specified step ID). * IN jobid_str - string representing a jobid: jobid[.stepid] * IN node_name - May be NULL, in which case it will attempt to * determine the NodeName of the local host on its own. * This is mostly of use when multiple-slurmd support is in use, * because if NULL is used when there are multiple slurmd on the * node, one of them will be selected more-or-less at random. */ extern void scontrol_list_pids(const char *jobid_str, const char *node_name) { slurm_step_id_t step_id = { .job_id = 0, .step_id = NO_VAL, .step_het_comp = NO_VAL, }; /* Job ID is optional */ if (jobid_str != NULL && jobid_str[0] != '*' && !_parse_jobid(jobid_str, &step_id.job_id)) { exit_code = 1; return; } /* Step ID is optional */ printf("%-8s %-8s %-8s %-7s %-8s\n", "PID", "JOBID", "STEPID", "LOCALID", "GLOBALID"); if (jobid_str == NULL || jobid_str[0] == '*') { _list_pids_all_jobs(node_name); } else if (_parse_stepid(jobid_str, &step_id)) _list_pids_all_steps(node_name, &step_id); } extern void scontrol_getent(const char *node_name) { List steps = NULL; ListIterator itr = NULL; step_loc_t *stepd; int fd; struct passwd *pwd = NULL; struct group **grps = NULL; if (!(steps = stepd_available(NULL, node_name))) { fprintf(stderr, "No steps found on this node\n"); return; } itr = list_iterator_create(steps); while ((stepd = list_next(itr))) { char tmp_char[45]; fd = stepd_connect(NULL, node_name, &stepd->step_id, &stepd->protocol_version); if (fd < 0) continue; pwd = stepd_getpw(fd, stepd->protocol_version, GETPW_MATCH_ALWAYS, 0, NULL); if (!pwd) { close(fd); continue; } log_build_step_id_str(&stepd->step_id, tmp_char, sizeof(tmp_char), STEP_ID_FLAG_NO_PREFIX); printf("JobId=%s:\nUser:\n", tmp_char); printf("%s:%s:%u:%u:%s:%s:%s\nGroups:\n", pwd->pw_name, pwd->pw_passwd, pwd->pw_uid, pwd->pw_gid, pwd->pw_gecos, pwd->pw_dir, pwd->pw_shell); xfree_struct_passwd(pwd); grps = stepd_getgr(fd, stepd->protocol_version, GETGR_MATCH_ALWAYS, 0, NULL); if (!grps) { close(fd); printf("\n"); continue; } for (int i = 0; grps[i]; i++) { printf("%s:%s:%u:%s\n", grps[i]->gr_name, grps[i]->gr_passwd, grps[i]->gr_gid, (grps[i]->gr_mem) ? grps[i]->gr_mem[0] : ""); } close(fd); xfree_struct_group_array(grps); printf("\n"); } list_iterator_destroy(itr); FREE_NULL_LIST(steps); } /* * scontrol_print_hosts - given a node list expression, return * a list of nodes, one per line */ extern void scontrol_print_hosts (char * node_list) { hostlist_t hl; char *host; if (!node_list) { error("host list is empty"); return; } hl = hostlist_create_dims(node_list, 0); if (!hl) { fprintf(stderr, "Invalid hostlist: %s\n", node_list); return; } while ((host = hostlist_shift_dims(hl, 0))) { printf("%s\n", host); free(host); } hostlist_destroy(hl); } /* Replace '\n' with ',', remove duplicate comma */ static void _reformat_hostlist(char *hostlist) { int i, o; for (i=0; (hostlist[i] != '\0'); i++) { if (hostlist[i] == '\n') hostlist[i] = ','; } o = 0; for (i=0; (hostlist[i] != '\0'); i++) { while ((hostlist[i] == ',') && (hostlist[i+1] == ',')) i++; hostlist[o++] = hostlist[i]; } hostlist[o] = '\0'; } /* * scontrol_encode_hostlist - given a list of hostnames or the pathname * of a file containing hostnames, translate them into a hostlist * expression */ extern int scontrol_encode_hostlist(char *hostlist, bool sorted) { char *io_buf = NULL, *tmp_list, *ranged_string; int buf_size = 1024 * 1024; hostlist_t hl; if (!hostlist) { fprintf(stderr, "Hostlist is NULL\n"); return SLURM_ERROR; } if (hostlist[0] == '/') { ssize_t buf_read; int fd = open(hostlist, O_RDONLY); if (fd < 0) { fprintf(stderr, "Can not open %s\n", hostlist); return SLURM_ERROR; } io_buf = xmalloc(buf_size); buf_read = read(fd, io_buf, buf_size); close(fd); if (buf_read >= buf_size) { /* If over 1MB, the file is almost certainly invalid */ fprintf(stderr, "File %s is too large\n", hostlist); xfree(io_buf); return SLURM_ERROR; } io_buf[buf_read] = '\0'; _reformat_hostlist(io_buf); tmp_list = io_buf; } else tmp_list = hostlist; hl = hostlist_create(tmp_list); if (hl == NULL) { fprintf(stderr, "Invalid hostlist: %s\n", tmp_list); xfree(io_buf); return SLURM_ERROR; } if (sorted) hostlist_sort(hl); ranged_string = hostlist_ranged_string_xmalloc(hl); printf("%s\n", ranged_string); hostlist_destroy(hl); xfree(ranged_string); xfree(io_buf); return SLURM_SUCCESS; } static int _wait_nodes_ready(uint32_t job_id) { int is_ready = SLURM_ERROR, i, rc = 0; int cur_delay = 0; int max_delay; if (!slurm_conf.suspend_timeout || !slurm_conf.resume_timeout) return SLURM_SUCCESS; /* Power save mode disabled */ max_delay = slurm_conf.suspend_timeout + slurm_conf.resume_timeout; max_delay *= 5; /* Allow for ResumeRate support */ for (i=0; (cur_delay < max_delay); i++) { if (i) { if (i == 1) info("Waiting for nodes to boot"); sleep(POLL_SLEEP); cur_delay += POLL_SLEEP; } rc = slurm_job_node_ready(job_id); if (rc == READY_JOB_FATAL) break; /* fatal error */ if ((rc == READY_JOB_ERROR) || (rc == EAGAIN)) continue; /* retry */ if ((rc & READY_JOB_STATE) == 0) /* job killed */ break; if ((rc & READY_NODE_STATE) && (rc & READY_PROLOG_STATE)) { is_ready = SLURM_SUCCESS; break; } } if (is_ready == SLURM_SUCCESS) info("Nodes are ready for job %u", job_id); else if ((rc & READY_JOB_STATE) == 0) info("Job %u no longer running", job_id); else info("Problem running job %u", job_id); return is_ready; } /* * Wait until a job is ready to execute or enters some failed state * RET 1: job ready to run * 0: job can't run (cancelled, failure state, timeout, etc.) */ extern int scontrol_job_ready(char *job_id_str) { uint32_t job_id; job_id = atoi(job_id_str); if (job_id <= 0) { fprintf(stderr, "Invalid job_id %s", job_id_str); return SLURM_ERROR; } return _wait_nodes_ready(job_id); } extern int scontrol_callerid(int argc, char **argv) { int af, ver = 4; unsigned char ip_src[sizeof(struct in6_addr)], ip_dst[sizeof(struct in6_addr)]; uint32_t port_src, port_dst, job_id; network_callerid_msg_t req; char node_name[MAXHOSTNAMELEN], *ptr; if (argc == 5) { ver = strtoul(argv[4], &ptr, 0); if (ptr && ptr[0]) { error("Address family not an integer"); return SLURM_ERROR; } } if (ver != 4 && ver != 6) { error("Invalid address family: %d", ver); return SLURM_ERROR; } af = ver == 4 ? AF_INET : AF_INET6; if (!inet_pton(af, argv[0], ip_src)) { error("inet_pton failed for '%s'", argv[0]); return SLURM_ERROR; } port_src = strtoul(argv[1], &ptr, 0); if (ptr && ptr[0]) { error("Source port not an integer"); return SLURM_ERROR; } if (!inet_pton(af, argv[2], ip_dst)) { error("scontrol_callerid: inet_pton failed for '%s'", argv[2]); return SLURM_ERROR; } port_dst = strtoul(argv[3], &ptr, 0); if (ptr && ptr[0]) { error("Destination port not an integer"); return SLURM_ERROR; } memcpy(req.ip_src, ip_src, 16); memcpy(req.ip_dst, ip_dst, 16); req.port_src = port_src; req.port_dst = port_dst; req.af = af; if (slurm_network_callerid(req, &job_id, node_name, MAXHOSTNAMELEN) != SLURM_SUCCESS) { fprintf(stderr, "slurm_network_callerid: unable to retrieve callerid data from remote slurmd\n"); return SLURM_ERROR; } else if (job_id == NO_VAL) { fprintf(stderr, "slurm_network_callerid: remote job id indeterminate\n"); return SLURM_ERROR; } else { printf("%u %s\n", job_id, node_name); return SLURM_SUCCESS; } } extern int scontrol_batch_script(int argc, char **argv) { char *filename; FILE *out; int exit_code; uint32_t jobid; if (argc < 1) return SLURM_ERROR; jobid = atoll(argv[0]); if (argc > 1) filename = xstrdup(argv[1]); else filename = xstrdup_printf("slurm-%u.sh", jobid); if (!xstrcmp(filename, "-")) { out = stdout; } else { if (!(out = fopen(filename, "w"))) { fprintf(stderr, "failed to open file `%s`: %m\n", filename); xfree(filename); return errno; } } exit_code = slurm_job_batch_script(out, jobid); if (out != stdout) fclose(out); if (exit_code != SLURM_SUCCESS) { if (out != stdout) unlink(filename); slurm_perror("job script retrieval failed"); } else if ((out != stdout) && (quiet_flag != 1)) { printf("batch script for job %u written to %s\n", jobid, filename); } xfree(filename); return exit_code; }