/*****************************************************************************\ * job_mgr.c - manage the job information of slurm * Note: there is a global job list (job_list), time stamp * (last_job_update), and hash table (job_hash) ***************************************************************************** * Copyright (C) 2002-2007 The Regents of the University of California. * Copyright (C) 2008-2010 Lawrence Livermore National Security. * Portions Copyright (C) 2010-2017 SchedMD . * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Morris Jette * CODE-OCEC-09-009. All rights reserved. * * This file is part of Slurm, a resource management program. * For details, see . * Please also read the included file: DISCLAIMER. * * Slurm is free software; you can redistribute it and/or modify it under * the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * * In addition, as a special exception, the copyright holders give permission * to link the code of portions of this program with the OpenSSL library under * certain conditions as described in each individual source file, and * distribute linked combinations including the two. You must obey the GNU * General Public License in all respects for all of the code used other than * OpenSSL. If you modify file(s) with this exception, you may extend this * exception to your version of the file(s), but you are not obligated to do * so. If you do not wish to do so, delete this exception statement from your * version. If you delete this exception statement from all source files in * the program, then also delete it here. * * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License along * with Slurm; if not, write to the Free Software Foundation, Inc., * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. \*****************************************************************************/ #include "config.h" #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include "slurm/slurm_errno.h" #include "src/common/slurm_acct_gather.h" #include "src/common/assoc_mgr.h" #include "src/common/bitstring.h" #include "src/common/cpu_frequency.h" #include "src/common/fd.h" #include "src/common/forward.h" #include "src/common/gres.h" #include "src/common/hostlist.h" #include "src/common/node_features.h" #include "src/common/node_select.h" #include "src/common/parse_time.h" #include "src/common/power.h" #include "src/common/slurm_accounting_storage.h" #include "src/common/slurm_auth.h" #include "src/common/slurm_jobcomp.h" #include "src/common/slurm_mcs.h" #include "src/common/slurm_priority.h" #include "src/common/slurm_protocol_pack.h" #include "src/common/switch.h" #include "src/common/timers.h" #include "src/common/track_script.h" #include "src/common/tres_bind.h" #include "src/common/tres_frequency.h" #include "src/common/uid.h" #include "src/common/xassert.h" #include "src/common/xstring.h" #include "src/slurmctld/acct_policy.h" #include "src/slurmctld/agent.h" #include "src/slurmctld/burst_buffer.h" #include "src/slurmctld/fed_mgr.h" #include "src/slurmctld/front_end.h" #include "src/slurmctld/gang.h" #include "src/slurmctld/job_scheduler.h" #include "src/slurmctld/job_submit.h" #include "src/slurmctld/licenses.h" #include "src/slurmctld/locks.h" #include "src/slurmctld/node_scheduler.h" #include "src/slurmctld/preempt.h" #include "src/slurmctld/proc_req.h" #include "src/slurmctld/reservation.h" #include "src/slurmctld/sched_plugin.h" #include "src/slurmctld/slurmctld.h" #include "src/slurmctld/slurmctld_plugstack.h" #include "src/slurmctld/srun_comm.h" #include "src/slurmctld/state_save.h" #include "src/slurmctld/trigger_mgr.h" #define ARRAY_ID_BUF_SIZE 32 #define DETAILS_FLAG 0xdddd #define MAX_EXIT_VAL 255 /* Maximum value returned by WIFEXITED() */ #define SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0 0 #define TOP_PRIORITY 0xffff0000 /* large, but leave headroom for higher */ #define PURGE_OLD_JOB_IN_SEC 2592000 /* 30 days in seconds */ #define JOB_HASH_INX(_job_id) (_job_id % hash_table_size) #define JOB_ARRAY_HASH_INX(_job_id, _task_id) \ ((_job_id + _task_id) % hash_table_size) /* No need to change we always pack SLURM_PROTOCOL_VERSION */ #define JOB_STATE_VERSION "PROTOCOL_VERSION" #define JOB_CKPT_VERSION "PROTOCOL_VERSION" typedef enum { JOB_HASH_JOB, JOB_HASH_ARRAY_JOB, JOB_HASH_ARRAY_TASK, } job_hash_type_t; typedef struct { int resp_array_cnt; int resp_array_size; uint32_t *resp_array_rc; bitstr_t **resp_array_task_id; } resp_array_struct_t; typedef struct { Buf buffer; uint32_t filter_uid; uint32_t *jobs_packed; uint16_t protocol_version; uint16_t show_flags; uid_t uid; } _foreach_pack_job_info_t; typedef struct { bitstr_t *node_map; int rc; } job_overlap_args_t; /* Global variables */ List job_list = NULL; /* job_record list */ time_t last_job_update; /* time of last update to job records */ List purge_files_list = NULL; /* job files to delete */ /* Local variables */ static int bf_min_age_reserve = 0; static uint32_t delay_boot = 0; static uint32_t highest_prio = 0; static uint32_t lowest_prio = TOP_PRIORITY; static int hash_table_size = 0; static int job_count = 0; /* job's in the system */ static uint32_t job_id_sequence = 0; /* first job_id to assign new job */ static struct job_record **job_hash = NULL; static struct job_record **job_array_hash_j = NULL; static struct job_record **job_array_hash_t = NULL; static bool kill_invalid_dep; static time_t last_file_write_time = (time_t) 0; static uint32_t max_array_size = NO_VAL; static bitstr_t *requeue_exit = NULL; static bitstr_t *requeue_exit_hold = NULL; static bool validate_cfgd_licenses = true; /* Local functions */ static void _add_job_hash(job_record_t *job_ptr); static void _add_job_array_hash(job_record_t *job_ptr); static void _clear_job_gres_details(job_record_t *job_ptr); static int _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id); static int _copy_job_desc_to_job_record(job_desc_msg_t * job_desc, job_record_t **job_ptr, bitstr_t ** exc_bitmap, bitstr_t ** req_bitmap); static char *_copy_nodelist_no_dup(char *node_list); static job_record_t *_create_job_record(uint32_t num_jobs); static void _delete_job_details(job_record_t *job_entry); static slurmdb_qos_rec_t *_determine_and_validate_qos( char *resv_name, slurmdb_assoc_rec_t *assoc_ptr, bool operator, slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked, log_level_t log_lvl); static void _dump_job_details(struct job_details *detail_ptr, Buf buffer); static void _dump_job_state(job_record_t *dump_job_ptr, Buf buffer); static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr, Buf buffer); static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src); static void _get_batch_job_dir_ids(List batch_dirs); static bool _get_whole_hetjob(void); static void _job_array_comp(job_record_t *job_ptr, bool was_running, bool requeue); static int _job_create(job_desc_msg_t * job_specs, int allocate, int will_run, job_record_t **job_rec_ptr, uid_t submit_uid, char **err_msg, uint16_t protocol_version); static void _job_timed_out(job_record_t *job_ptr, bool preempted); static void _kill_dependent(job_record_t *job_ptr); static void _list_delete_job(void *job_entry); static int _list_find_job_old(void *job_entry, void *key); static int _load_job_details(job_record_t *job_ptr, Buf buffer, uint16_t protocol_version); static int _load_job_fed_details(job_fed_details_t **fed_details_pptr, Buf buffer, uint16_t protocol_version); static int _load_job_state(Buf buffer, uint16_t protocol_version); static bitstr_t *_make_requeue_array(char *conf_buf); static uint32_t _max_switch_wait(uint32_t input_wait); static void _notify_srun_missing_step(job_record_t *job_ptr, int node_inx, time_t now, time_t node_boot_time); static Buf _open_job_state_file(char **state_file); static time_t _get_last_job_state_write_time(void); static void _pack_default_job_details(job_record_t *job_ptr, Buf buffer, uint16_t protocol_version); static void _pack_pending_job_details(struct job_details *detail_ptr, Buf buffer, uint16_t protocol_version); static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max); static void _purge_missing_jobs(int node_inx, time_t now); static int _read_data_array_from_file(int fd, char *file_name, char ***data, uint32_t *size, job_record_t *job_ptr); static void _remove_defunct_batch_dirs(List batch_dirs); static void _remove_job_hash(job_record_t *job_ptr, job_hash_type_t type); static int _reset_detail_bitmaps(job_record_t *job_ptr); static void _reset_step_bitmaps(job_record_t *job_ptr); static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr, uint32_t rc); static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id, uint32_t task_id, uint32_t rc); static void _resp_array_free(resp_array_struct_t *resp); static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp, uint32_t job_id); static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp); static void _send_job_kill(job_record_t *job_ptr); static int _set_job_id(job_record_t *job_ptr); static void _set_job_requeue_exit_value(job_record_t *job_ptr); static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal, uint16_t flags); static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags); static void _suspend_job(job_record_t *job_ptr, uint16_t op, bool indf_susp); static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp); static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset); static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid, bitstr_t *req_bitmap, part_record_t *part_ptr, List part_ptr_list, slurmdb_assoc_rec_t *assoc_ptr, slurmdb_qos_rec_t *qos_ptr); static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate, uid_t submit_uid, part_record_t *part_ptr, List part_list); static void _validate_job_files(List batch_dirs); static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg, part_record_t *part_ptr, List part_list); static bool _valid_pn_min_mem(job_desc_msg_t * job_desc_msg, part_record_t *part_ptr); static int _write_data_to_file(char *file_name, char *data); static int _write_data_array_to_file(char *file_name, char **data, uint32_t size); static void _xmit_new_end_time(job_record_t *job_ptr); static char *_get_mail_user(const char *user_name, uid_t user_id) { char *mail_user = NULL; if (!user_name || (user_name[0] == '\0')) { mail_user = uid_to_string(user_id); /* unqualified sender, append MailDomain if set */ if (slurmctld_conf.mail_domain) { xstrfmtcat(mail_user, "@%s", slurmctld_conf.mail_domain); } } else { mail_user = xstrdup(user_name); } return mail_user; } static int _job_fail_account(job_record_t *job_ptr, const char *func_name) { int rc = 0; // Return number of pending jobs held if (IS_JOB_PENDING(job_ptr)) { info("%s: %pJ ineligible due to invalid association", func_name, job_ptr); xfree(job_ptr->state_desc); job_ptr->state_reason = FAIL_ACCOUNT; if (job_ptr->details) { /* reset the job */ job_ptr->details->accrue_time = 0; job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; job_ptr->details->begin_time = 0; /* Update job with new begin_time. */ jobacct_storage_g_job_start(acct_db_conn, job_ptr); } rc = 1; } /* This job is no longer eligible, so make it so. */ if (job_ptr->assoc_ptr) { part_record_t *tmp_part = job_ptr->part_ptr; List tmp_part_list = job_ptr->part_ptr_list; slurmdb_qos_rec_t *tmp_qos = job_ptr->qos_ptr; /* * Force a start so the association doesn't get lost. Since * there could be some delay in the start of the job when * running with the slurmdbd. */ if (!job_ptr->db_index) jobacct_storage_g_job_start(acct_db_conn, job_ptr); /* * Don't call acct_policy_remove_accrue_time() here, the cnt on * parent associations will be handled correctly by the removal * of the association. */ /* * Clear ptrs so that only association usage is removed. * Otherwise qos and partition limits will be double accounted * for when this job finishes. Don't do this for acrrual time, * it has be on both because the job is ineligible and can't * accrue time. */ job_ptr->part_ptr = NULL; job_ptr->part_ptr_list = NULL; job_ptr->qos_ptr = NULL; acct_policy_remove_job_submit(job_ptr); job_ptr->part_ptr = tmp_part; job_ptr->part_ptr_list = tmp_part_list; job_ptr->qos_ptr = tmp_qos; job_ptr->assoc_ptr = NULL; } job_ptr->assoc_id = 0; return rc; } extern int job_fail_qos(job_record_t *job_ptr, const char *func_name) { int rc = 0; // Return number of pending jobs held if (IS_JOB_PENDING(job_ptr)) { info("%s: %pJ ineligible due to invalid qos", func_name, job_ptr); xfree(job_ptr->state_desc); job_ptr->state_reason = FAIL_QOS; if (job_ptr->details) { /* reset the job */ job_ptr->details->accrue_time = 0; job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; job_ptr->details->begin_time = 0; /* Update job with new begin_time. */ jobacct_storage_g_job_start(acct_db_conn, job_ptr); } rc = 1; } /* This job is no longer eligible, so make it so. */ if (job_ptr->qos_ptr) { slurmdb_assoc_rec_t *tmp_assoc = job_ptr->assoc_ptr; /* * Force a start so the qos doesn't get lost. Since * there could be some delay in the start of the job when * running with the slurmdbd. */ if (!job_ptr->db_index) jobacct_storage_g_job_start(acct_db_conn, job_ptr); /* * Don't call acct_policy_remove_accrue_time() here, the cnt on * parent associations will be handled correctly by the removal * of the association. */ /* * Clear ptrs so that only qos usage is removed. Otherwise * association limits will be double accounted for when this * job finishes. Don't do this for acrrual time, it has be on * both because the job is ineligible and can't accrue time. */ job_ptr->assoc_ptr = NULL; acct_policy_remove_job_submit(job_ptr); job_ptr->assoc_ptr = tmp_assoc; job_ptr->qos_ptr = NULL; } return rc; } /* * Functions used to manage job array responses with a separate return code * possible for each task ID */ /* Add job record to resp_array_struct_t, free with _resp_array_free() */ static void _resp_array_add(resp_array_struct_t **resp, job_record_t *job_ptr, uint32_t rc) { resp_array_struct_t *loc_resp; int array_size; int i; if ((job_ptr->array_task_id == NO_VAL) && (job_ptr->array_recs == NULL)) { error("%s: called for non-job array %pJ", __func__, job_ptr); return; } if (max_array_size == NO_VAL) { max_array_size = slurmctld_conf.max_array_sz; } xassert(resp); if (*resp == NULL) { /* Initialize the data structure */ loc_resp = xmalloc(sizeof(resp_array_struct_t)); loc_resp->resp_array_cnt = 0; loc_resp->resp_array_size = 10; xrealloc(loc_resp->resp_array_rc, (sizeof(uint32_t) * loc_resp->resp_array_size)); xrealloc(loc_resp->resp_array_task_id, (sizeof(bitstr_t *) * loc_resp->resp_array_size)); *resp = loc_resp; } else { loc_resp = *resp; } for (i = 0; i < loc_resp->resp_array_cnt; i++) { if (loc_resp->resp_array_rc[i] != rc) continue; /* Add to existing error code record */ if (job_ptr->array_task_id != NO_VAL) { if (job_ptr->array_task_id < bit_size(loc_resp->resp_array_task_id[i])) { bit_set(loc_resp->resp_array_task_id[i], job_ptr->array_task_id); } else { error("%s: found invalid task id %pJ", __func__, job_ptr); } } else if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { array_size = bit_size(job_ptr->array_recs-> task_id_bitmap); if (bit_size(loc_resp->resp_array_task_id[i]) != array_size) { loc_resp->resp_array_task_id[i] = bit_realloc( loc_resp->resp_array_task_id[i], array_size); } bit_or(loc_resp->resp_array_task_id[i], job_ptr->array_recs->task_id_bitmap); } else { error("%s: found job %pJ without task ID or bitmap", __func__, job_ptr); } return; } /* Need to add a new record for this error code */ if (loc_resp->resp_array_cnt >= loc_resp->resp_array_size) { /* Need to grow the table size */ loc_resp->resp_array_size += 10; xrealloc(loc_resp->resp_array_rc, (sizeof(uint32_t) * loc_resp->resp_array_size)); xrealloc(loc_resp->resp_array_task_id, (sizeof(bitstr_t *) * loc_resp->resp_array_size)); } loc_resp->resp_array_rc[loc_resp->resp_array_cnt] = rc; if (job_ptr->array_task_id != NO_VAL) { loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] = bit_alloc(max_array_size); if (job_ptr->array_task_id < bit_size(loc_resp->resp_array_task_id [loc_resp->resp_array_cnt])) { bit_set(loc_resp->resp_array_task_id [loc_resp->resp_array_cnt], job_ptr->array_task_id); } } else if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] = bit_copy(job_ptr->array_recs->task_id_bitmap); } else { error("%s: found %pJ without task ID or bitmap", __func__, job_ptr); loc_resp->resp_array_task_id[loc_resp->resp_array_cnt] = bit_alloc(max_array_size); } loc_resp->resp_array_cnt++; return; } /* Add record to resp_array_struct_t, free with _resp_array_free(). * This is a variant of _resp_array_add for the case where a job/task ID * is not found, so we use a dummy job record based upon the input IDs. */ static void _resp_array_add_id(resp_array_struct_t **resp, uint32_t job_id, uint32_t task_id, uint32_t rc) { job_record_t job_ptr; job_ptr.job_id = job_id; job_ptr.array_job_id = job_id; job_ptr.array_task_id = task_id; job_ptr.array_recs = NULL; _resp_array_add(resp, &job_ptr, rc); } /* Free resp_array_struct_t built by _resp_array_add() */ static void _resp_array_free(resp_array_struct_t *resp) { int i; if (resp) { for (i = 0; i < resp->resp_array_cnt; i++) FREE_NULL_BITMAP(resp->resp_array_task_id[i]); xfree(resp->resp_array_task_id); xfree(resp->resp_array_rc); xfree(resp); } } /* Translate internal job array data structure into a response message */ static job_array_resp_msg_t *_resp_array_xlate(resp_array_struct_t *resp, uint32_t job_id) { job_array_resp_msg_t *msg; char task_str[ARRAY_ID_BUF_SIZE]; int *ffs = NULL; int i, j, low; ffs = xcalloc(resp->resp_array_cnt, sizeof(int)); for (i = 0; i < resp->resp_array_cnt; i++) { ffs[i] = bit_ffs(resp->resp_array_task_id[i]); } msg = xmalloc(sizeof(job_array_resp_msg_t)); msg->job_array_count = resp->resp_array_cnt; msg->job_array_id = xcalloc(resp->resp_array_cnt, sizeof(char *)); msg->error_code = xcalloc(resp->resp_array_cnt, sizeof(uint32_t)); for (i = 0; i < resp->resp_array_cnt; i++) { low = -1; for (j = 0; j < resp->resp_array_cnt; j++) { if ((ffs[j] != -1) && ((low == -1) || (ffs[j] < ffs[low]))) low = j; } if (low == -1) break; ffs[low] = -1; msg->error_code[i] = resp->resp_array_rc[low]; bit_fmt(task_str, ARRAY_ID_BUF_SIZE, resp->resp_array_task_id[low]); if (strlen(task_str) >= ARRAY_ID_BUF_SIZE - 2) { /* Append "..." to the buffer on overflow */ task_str[ARRAY_ID_BUF_SIZE - 4] = '.'; task_str[ARRAY_ID_BUF_SIZE - 3] = '.'; task_str[ARRAY_ID_BUF_SIZE - 2] = '.'; task_str[ARRAY_ID_BUF_SIZE - 1] = '\0'; } xstrfmtcat(msg->job_array_id[i], "%u_%s", job_id, task_str); } xfree(ffs); return msg; } /* * _create_job_record - create an empty job_record including job_details. * load its values with defaults (zeros, nulls, and magic cookie) * IN num_jobs - number of jobs this record should represent * = 0 - split out a job array record to its own job record * = 1 - simple job OR job array with one task * > 1 - job array create with the task count as num_jobs * RET pointer to the record or NULL if error * NOTE: allocates memory that should be xfreed with _list_delete_job */ static job_record_t *_create_job_record(uint32_t num_jobs) { job_record_t *job_ptr = xmalloc(sizeof(*job_ptr)); struct job_details *detail_ptr = xmalloc(sizeof(*detail_ptr)); if ((job_count + num_jobs) >= slurmctld_conf.max_job_cnt) { error("%s: MaxJobCount limit from slurm.conf reached (%u)", __func__, slurmctld_conf.max_job_cnt); } job_count += num_jobs; last_job_update = time(NULL); job_ptr->magic = JOB_MAGIC; job_ptr->array_task_id = NO_VAL; job_ptr->details = detail_ptr; job_ptr->prio_factors = xmalloc(sizeof(priority_factors_object_t)); job_ptr->site_factor = NICE_OFFSET; job_ptr->step_list = list_create(NULL); xassert (detail_ptr->magic = DETAILS_MAGIC); /* set value */ detail_ptr->submit_time = time(NULL); job_ptr->requid = -1; /* force to -1 for sacct to know this * hasn't been set yet */ job_ptr->billable_tres = (double)NO_VAL; (void) list_append(job_list, job_ptr); return job_ptr; } /* * _delete_job_details - delete a job's detail record and clear it's pointer * IN job_entry - pointer to job_record to clear the record of */ static void _delete_job_details(job_record_t *job_entry) { int i; if (job_entry->details == NULL) return; xassert (job_entry->details->magic == DETAILS_MAGIC); /* * Queue up job to have the batch script and environment deleted. * This is handled by a separate thread to limit the amount of * time purge_old_job needs to spend holding locks. */ if (IS_JOB_FINISHED(job_entry)) { uint32_t *job_id = xmalloc(sizeof(uint32_t)); *job_id = job_entry->job_id; list_enqueue(purge_files_list, job_id); } xfree(job_entry->details->acctg_freq); for (i=0; idetails->argc; i++) xfree(job_entry->details->argv[i]); xfree(job_entry->details->argv); xfree(job_entry->details->cpu_bind); FREE_NULL_LIST(job_entry->details->depend_list); xfree(job_entry->details->dependency); xfree(job_entry->details->orig_dependency); for (i=0; idetails->env_cnt; i++) xfree(job_entry->details->env_sup[i]); xfree(job_entry->details->env_sup); xfree(job_entry->details->std_err); FREE_NULL_BITMAP(job_entry->details->exc_node_bitmap); xfree(job_entry->details->exc_nodes); xfree(job_entry->details->extra); FREE_NULL_LIST(job_entry->details->feature_list); xfree(job_entry->details->features); xfree(job_entry->details->cluster_features); xfree(job_entry->details->std_in); xfree(job_entry->details->mc_ptr); xfree(job_entry->details->mem_bind); xfree(job_entry->details->std_out); FREE_NULL_BITMAP(job_entry->details->req_node_bitmap); xfree(job_entry->details->req_nodes); xfree(job_entry->details->work_dir); xfree(job_entry->details->x11_magic_cookie); xfree(job_entry->details->x11_target); xfree(job_entry->details); /* Must be last */ } /* * delete_job_desc_files - delete job descriptor related files * * Note that this will be called on all individual job array tasks, * even though (as of 17.11) individual directories are no longer created. */ extern void delete_job_desc_files(uint32_t job_id) { char *dir_name = NULL, *file_name = NULL; struct stat sbuf; int hash = job_id % 10; DIR *f_dir; struct dirent *dir_ent; dir_name = xstrdup_printf("%s/hash.%d/job.%u", slurmctld_conf.state_save_location, hash, job_id); if (stat(dir_name, &sbuf)) { xfree(dir_name); return; } f_dir = opendir(dir_name); if (f_dir) { while ((dir_ent = readdir(f_dir))) { if (!xstrcmp(dir_ent->d_name, ".") || !xstrcmp(dir_ent->d_name, "..")) continue; xstrfmtcat(file_name, "%s/%s", dir_name, dir_ent->d_name); (void) unlink(file_name); xfree(file_name); } closedir(f_dir); } else { error("opendir(%s): %m", dir_name); } (void) rmdir(dir_name); xfree(dir_name); } static uint32_t _max_switch_wait(uint32_t input_wait) { static time_t sched_update = 0; static uint32_t max_wait = 300; /* default max_switch_wait, seconds */ char *sched_params, *tmp_ptr; int i; if (sched_update != slurmctld_conf.last_update) { sched_update = slurmctld_conf.last_update; sched_params = slurm_get_sched_params(); if ((tmp_ptr = xstrcasestr(sched_params, "max_switch_wait="))) { /* 0123456789012345 */ i = atoi(tmp_ptr + 16); if (i < 0) { error("ignoring SchedulerParameters: " "max_switch_wait of %d", i); } else { max_wait = i; } } xfree(sched_params); } if (max_wait > input_wait) return input_wait; return max_wait; } static slurmdb_qos_rec_t *_determine_and_validate_qos( char *resv_name, slurmdb_assoc_rec_t *assoc_ptr, bool operator, slurmdb_qos_rec_t *qos_rec, int *error_code, bool locked, log_level_t log_lvl) { slurmdb_qos_rec_t *qos_ptr = NULL; /* If enforcing associations make sure this is a valid qos with the association. If not just fill in the qos and continue. */ xassert(qos_rec); assoc_mgr_get_default_qos_info(assoc_ptr, qos_rec); if (assoc_mgr_fill_in_qos(acct_db_conn, qos_rec, accounting_enforce, &qos_ptr, locked) != SLURM_SUCCESS) { log_var(log_lvl, "Invalid qos (%s)", qos_rec->name); *error_code = ESLURM_INVALID_QOS; return NULL; } if ((accounting_enforce & ACCOUNTING_ENFORCE_QOS) && assoc_ptr && !operator && (!assoc_ptr->usage->valid_qos || !bit_test(assoc_ptr->usage->valid_qos, qos_rec->id))) { log_var(log_lvl, "This association %d(account='%s', user='%s', partition='%s') does not have access to qos %s", assoc_ptr->id, assoc_ptr->acct, assoc_ptr->user, assoc_ptr->partition, qos_rec->name); *error_code = ESLURM_INVALID_QOS; return NULL; } if (qos_ptr && (qos_ptr->flags & QOS_FLAG_REQ_RESV) && (!resv_name || resv_name[0] == '\0')) { log_var(log_lvl, "qos %s can only be used in a reservation", qos_rec->name); *error_code = ESLURM_INVALID_QOS; return NULL; } *error_code = SLURM_SUCCESS; return qos_ptr; } /* * dump_all_job_state - save the state of all jobs to file for checkpoint * Changes here should be reflected in load_last_job_id() and * load_all_job_state(). * RET 0 or error code */ int dump_all_job_state(void) { /* Save high-water mark to avoid buffer growth with copies */ static int high_buffer_size = (1024 * 1024); int error_code = SLURM_SUCCESS, log_fd; char *old_file, *new_file, *reg_file; struct stat stat_buf; /* Locks: Read config and job */ slurmctld_lock_t job_read_lock = { READ_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; ListIterator job_iterator; job_record_t *job_ptr; Buf buffer = init_buf(high_buffer_size); time_t now = time(NULL); time_t last_state_file_time; DEF_TIMERS; START_TIMER; /* * Check that last state file was written at expected time. * This is a check for two slurmctld daemons running at the same * time in primary mode (a split-brain problem). */ last_state_file_time = _get_last_job_state_write_time(); if (last_file_write_time && last_state_file_time && (last_file_write_time != last_state_file_time)) { error("Bad job state save file time. We wrote it at time %u, " "but the file contains a time stamp of %u.", (uint32_t) last_file_write_time, (uint32_t) last_state_file_time); if (slurmctld_primary == 0) { fatal("Two slurmctld daemons are running as primary. " "Shutting down this daemon to avoid inconsistent " "state due to split brain."); } } /* write header: version, time */ packstr(JOB_STATE_VERSION, buffer); pack16(SLURM_PROTOCOL_VERSION, buffer); pack_time(now, buffer); /* * write header: job id * This is needed so that the job id remains persistent even after * slurmctld is restarted. */ pack32( job_id_sequence, buffer); debug3("Writing job id %u to header record of job_state file", job_id_sequence); /* write individual job records */ lock_slurmctld(job_read_lock); job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { _dump_job_state(job_ptr, buffer); } list_iterator_destroy(job_iterator); /* write the buffer to file */ old_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(old_file, "/job_state.old"); reg_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(reg_file, "/job_state"); new_file = xstrdup(slurmctld_conf.state_save_location); xstrcat(new_file, "/job_state.new"); unlock_slurmctld(job_read_lock); if (stat(reg_file, &stat_buf) == 0) { static time_t last_mtime = (time_t) 0; int delta_t = difftime(stat_buf.st_mtime, last_mtime); if (delta_t < -10) { error("The modification time of %s moved backwards " "by %d seconds", reg_file, (0-delta_t)); error("The clock of the file system and this computer " "appear to not be synchronized"); /* It could be safest to exit here. We likely mounted * a different file system with the state save files */ } last_mtime = time(NULL); } lock_state_files(); log_fd = open(new_file, O_CREAT|O_WRONLY|O_TRUNC|O_CLOEXEC, 0600); if (log_fd < 0) { error("Can't save state, create file %s error %m", new_file); error_code = errno; } else { int pos = 0, nwrite, amount, rc; char *data; nwrite = get_buf_offset(buffer); data = (char *)get_buf_data(buffer); high_buffer_size = MAX(nwrite, high_buffer_size); while (nwrite > 0) { amount = write(log_fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", new_file); error_code = errno; break; } nwrite -= amount; pos += amount; } rc = fsync_and_close(log_fd, "job"); if (rc && !error_code) error_code = rc; } if (error_code) (void) unlink(new_file); else { /* file shuffle */ (void) unlink(old_file); if (link(reg_file, old_file)) debug4("unable to create link for %s -> %s: %m", reg_file, old_file); (void) unlink(reg_file); if (link(new_file, reg_file)) debug4("unable to create link for %s -> %s: %m", new_file, reg_file); (void) unlink(new_file); last_file_write_time = now; } xfree(old_file); xfree(reg_file); xfree(new_file); unlock_state_files(); free_buf(buffer); END_TIMER2("dump_all_job_state"); return error_code; } static int _find_resv_part(void *x, void *key) { slurmctld_resv_t *resv_ptr = (slurmctld_resv_t *) x; if (resv_ptr->part_ptr != (part_record_t *) key) return 0; else return 1; /* match */ } /* Open the job state save file, or backup if necessary. * state_file IN - the name of the state save file used * RET the file description to read from or error code */ static Buf _open_job_state_file(char **state_file) { Buf buf; xassert(state_file); xassert(!*state_file); *state_file = xstrdup_printf("%s/job_state", slurmctld_conf.state_save_location); if (!(buf = create_mmap_buf(*state_file))) error("Could not open job state file %s: %m", *state_file); else return buf; error("NOTE: Trying backup state save file. Jobs may be lost!"); xstrcat(*state_file, ".old"); return create_mmap_buf(*state_file); } extern void set_job_failed_assoc_qos_ptr(job_record_t *job_ptr) { if (!job_ptr->assoc_ptr && (job_ptr->state_reason == FAIL_ACCOUNT)) { slurmdb_assoc_rec_t assoc_rec; memset(&assoc_rec, 0, sizeof(assoc_rec)); /* * For speed and accurracy we will first see if we once had an * association record. If not look for it by * account,partition, user_id. */ if (job_ptr->assoc_id) assoc_rec.id = job_ptr->assoc_id; else { assoc_rec.acct = job_ptr->account; if (job_ptr->part_ptr) assoc_rec.partition = job_ptr->part_ptr->name; assoc_rec.uid = job_ptr->user_id; } if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, &job_ptr->assoc_ptr, false) == SLURM_SUCCESS) { job_ptr->assoc_id = assoc_rec.id; debug("%s: Filling in assoc for %pJ Assoc=%u", __func__, job_ptr, job_ptr->assoc_id); job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); last_job_update = time(NULL); } } if (!job_ptr->qos_ptr && (job_ptr->state_reason == FAIL_QOS)) { int qos_error = SLURM_SUCCESS; slurmdb_qos_rec_t qos_rec; memset(&qos_rec, 0, sizeof(qos_rec)); qos_rec.id = job_ptr->qos_id; job_ptr->qos_ptr = _determine_and_validate_qos( job_ptr->resv_name, job_ptr->assoc_ptr, job_ptr->limit_set.qos, &qos_rec, &qos_error, false, LOG_LEVEL_DEBUG2); if ((qos_error == SLURM_SUCCESS) && job_ptr->qos_ptr) { debug("%s: Filling in QOS for %pJ QOS=%s(%u)", __func__, job_ptr, qos_rec.name, job_ptr->qos_id); job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); last_job_update = time(NULL); } } } extern void set_job_tres_req_str(job_record_t *job_ptr, bool assoc_mgr_locked) { assoc_mgr_lock_t locks = { .tres = READ_LOCK }; xassert(job_ptr); if (!assoc_mgr_locked) assoc_mgr_lock(&locks); xfree(job_ptr->tres_req_str); job_ptr->tres_req_str = assoc_mgr_make_tres_str_from_array( job_ptr->tres_req_cnt, TRES_STR_FLAG_SIMPLE, true); xfree(job_ptr->tres_fmt_req_str); job_ptr->tres_fmt_req_str = assoc_mgr_make_tres_str_from_array( job_ptr->tres_req_cnt, TRES_STR_CONVERT_UNITS, true); if (!assoc_mgr_locked) assoc_mgr_unlock(&locks); } extern void set_job_tres_alloc_str(job_record_t *job_ptr, bool assoc_mgr_locked) { assoc_mgr_lock_t locks = { .tres = READ_LOCK }; xassert(job_ptr); if (!assoc_mgr_locked) assoc_mgr_lock(&locks); xfree(job_ptr->tres_alloc_str); job_ptr->tres_alloc_str = assoc_mgr_make_tres_str_from_array( job_ptr->tres_alloc_cnt, TRES_STR_FLAG_SIMPLE, true); xfree(job_ptr->tres_fmt_alloc_str); job_ptr->tres_fmt_alloc_str = assoc_mgr_make_tres_str_from_array( job_ptr->tres_alloc_cnt, TRES_STR_CONVERT_UNITS, true); if (!assoc_mgr_locked) assoc_mgr_unlock(&locks); } /* Note that the backup slurmctld has assumed primary control. * This function can be called multiple times. */ extern void backup_slurmctld_restart(void) { last_file_write_time = (time_t) 0; } /* Return the time stamp in the current job state save file, 0 is returned on * error */ static time_t _get_last_job_state_write_time(void) { int error_code = SLURM_SUCCESS; char *state_file = NULL; Buf buffer; time_t buf_time = (time_t) 0; char *ver_str = NULL; uint32_t ver_str_len; uint16_t protocol_version = NO_VAL16; /* read the file */ if (!(buffer = _open_job_state_file(&state_file))) { info("No job state file (%s) found", state_file); error_code = ENOENT; } xfree(state_file); if (error_code) return buf_time; safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION)) safe_unpack16(&protocol_version, buffer); safe_unpack_time(&buf_time, buffer); unpack_error: xfree(ver_str); free_buf(buffer); return buf_time; } /* * load_all_job_state - load the job state from file, recover from last * checkpoint. Execute this after loading the configuration file data. * Changes here should be reflected in load_last_job_id(). * RET 0 or error code */ extern int load_all_job_state(void) { int error_code = SLURM_SUCCESS; int job_cnt = 0; char *state_file = NULL; Buf buffer; time_t buf_time; uint32_t saved_job_id; char *ver_str = NULL; uint32_t ver_str_len; uint16_t protocol_version = NO_VAL16; /* read the file */ lock_state_files(); if (!(buffer = _open_job_state_file(&state_file))) { info("No job state file (%s) to recover", state_file); xfree(state_file); unlock_state_files(); return ENOENT; } xfree(state_file); unlock_state_files(); job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); debug3("Version string in job_state header is %s", ver_str); if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION)) safe_unpack16(&protocol_version, buffer); xfree(ver_str); if (protocol_version == NO_VAL16) { if (!ignore_state_errors) fatal("Can not recover job state, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); error("***********************************************"); error("Can not recover job state, incompatible version"); error("***********************************************"); free_buf(buffer); return EFAULT; } safe_unpack_time(&buf_time, buffer); safe_unpack32(&saved_job_id, buffer); if (saved_job_id <= slurmctld_conf.max_job_id) job_id_sequence = MAX(saved_job_id, job_id_sequence); debug3("Job id in job_state header is %u", saved_job_id); /* * Previously we locked the tres read lock before this loop. It turned * out that created a double lock when steps were being loaded during * the calls to jobacctinfo_create() which also locks the read lock. * It ended up being much easier to move the locks for the assoc_mgr * into the _load_job_state function than any other option. */ while (remaining_buf(buffer) > 0) { error_code = _load_job_state(buffer, protocol_version); if (error_code != SLURM_SUCCESS) goto unpack_error; job_cnt++; } debug3("Set job_id_sequence to %u", job_id_sequence); free_buf(buffer); info("Recovered information about %d jobs", job_cnt); return error_code; unpack_error: if (!ignore_state_errors) fatal("Incomplete job state save file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); error("Incomplete job state save file"); info("Recovered information about %d jobs", job_cnt); free_buf(buffer); return SLURM_ERROR; } /* * load_last_job_id - load only the last job ID from state save file. * Changes here should be reflected in load_all_job_state(). * RET 0 or error code */ extern int load_last_job_id( void ) { char *state_file = NULL; Buf buffer; time_t buf_time; char *ver_str = NULL; uint32_t ver_str_len; uint16_t protocol_version = NO_VAL16; /* read the file */ lock_state_files(); if (!(buffer = _open_job_state_file(&state_file))) { debug("No job state file (%s) to recover", state_file); xfree(state_file); unlock_state_files(); return ENOENT; } xfree(state_file); unlock_state_files(); safe_unpackstr_xmalloc(&ver_str, &ver_str_len, buffer); debug3("Version string in job_state header is %s", ver_str); if (ver_str && !xstrcmp(ver_str, JOB_STATE_VERSION)) safe_unpack16(&protocol_version, buffer); xfree(ver_str); if (protocol_version == NO_VAL16) { if (!ignore_state_errors) fatal("Can not recover last job ID, incompatible version, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); debug("*************************************************"); debug("Can not recover last job ID, incompatible version"); debug("*************************************************"); free_buf(buffer); return EFAULT; } safe_unpack_time(&buf_time, buffer); safe_unpack32( &job_id_sequence, buffer); debug3("Job ID in job_state header is %u", job_id_sequence); /* Ignore the state for individual jobs stored here */ xfree(ver_str); free_buf(buffer); return SLURM_SUCCESS; unpack_error: if (!ignore_state_errors) fatal("Invalid job data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered."); error("Invalid job data checkpoint file"); xfree(ver_str); free_buf(buffer); return SLURM_ERROR; } static void _pack_acct_policy_limit(acct_policy_limit_set_t *limit_set, Buf buffer, uint16_t protocol_version) { xassert(limit_set); pack16(limit_set->qos, buffer); pack16(limit_set->time, buffer); pack16_array(limit_set->tres, slurmctld_tres_cnt, buffer); } static int _unpack_acct_policy_limit_members( acct_policy_limit_set_t *limit_set, Buf buffer, uint16_t protocol_version) { uint32_t tmp32; xassert(limit_set); safe_unpack16(&limit_set->qos, buffer); safe_unpack16(&limit_set->time, buffer); xfree(limit_set->tres); safe_unpack16_array(&limit_set->tres, &tmp32, buffer); /* * Because the tres array could have grown or the tres could have moved * positions, the array needs to be rebuilt and the old values need to * be copied into their new spots. */ if ((tmp32 < slurmctld_tres_cnt) || assoc_mgr_tres_pos_changed()) update_job_limit_set_tres(&limit_set->tres); return SLURM_SUCCESS; unpack_error: xfree(limit_set->tres); return SLURM_ERROR; } /* * _dump_job_state - dump the state of a specific job, its details, and * steps to a buffer * IN dump_job_ptr - pointer to job for which information is requested * IN/OUT buffer - location to store data, pointers automatically advanced */ static void _dump_job_state(job_record_t *dump_job_ptr, Buf buffer) { struct job_details *detail_ptr; uint32_t tmp_32; xassert(dump_job_ptr->magic == JOB_MAGIC); /* Don't pack "unlinked" job. */ if (dump_job_ptr->job_id == NO_VAL) return; /* Dump basic job info */ pack32(dump_job_ptr->array_job_id, buffer); pack32(dump_job_ptr->array_task_id, buffer); if (dump_job_ptr->array_recs) { build_array_str(dump_job_ptr); if (dump_job_ptr->array_recs->task_id_bitmap) { tmp_32 = bit_size(dump_job_ptr->array_recs-> task_id_bitmap); } else tmp_32 = 0; pack32(tmp_32, buffer); if (tmp_32) packstr(dump_job_ptr->array_recs->task_id_str, buffer); pack32(dump_job_ptr->array_recs->array_flags, buffer); pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); pack32(dump_job_ptr->array_recs->tot_run_tasks, buffer); pack32(dump_job_ptr->array_recs->min_exit_code, buffer); pack32(dump_job_ptr->array_recs->max_exit_code, buffer); pack32(dump_job_ptr->array_recs->tot_comp_tasks, buffer); } else { tmp_32 = NO_VAL; pack32(tmp_32, buffer); } pack32(dump_job_ptr->assoc_id, buffer); packstr(dump_job_ptr->batch_features, buffer); pack32(dump_job_ptr->delay_boot, buffer); pack32(dump_job_ptr->job_id, buffer); pack32(dump_job_ptr->user_id, buffer); pack32(dump_job_ptr->group_id, buffer); pack32(dump_job_ptr->time_limit, buffer); pack32(dump_job_ptr->time_min, buffer); pack32(dump_job_ptr->priority, buffer); pack32(dump_job_ptr->alloc_sid, buffer); pack32(dump_job_ptr->total_cpus, buffer); if (dump_job_ptr->total_nodes) pack32(dump_job_ptr->total_nodes, buffer); else pack32(dump_job_ptr->node_cnt_wag, buffer); pack32(dump_job_ptr->cpu_cnt, buffer); pack32(dump_job_ptr->exit_code, buffer); pack32(dump_job_ptr->derived_ec, buffer); pack64(dump_job_ptr->db_index, buffer); pack32(dump_job_ptr->resv_id, buffer); pack32(dump_job_ptr->next_step_id, buffer); pack32(dump_job_ptr->het_job_id, buffer); packstr(dump_job_ptr->het_job_id_set, buffer); pack32(dump_job_ptr->het_job_offset, buffer); pack32(dump_job_ptr->qos_id, buffer); pack32(dump_job_ptr->req_switch, buffer); pack32(dump_job_ptr->wait4switch, buffer); pack32(dump_job_ptr->profile, buffer); pack32(dump_job_ptr->db_flags, buffer); pack_time(dump_job_ptr->last_sched_eval, buffer); pack_time(dump_job_ptr->preempt_time, buffer); pack_time(dump_job_ptr->start_time, buffer); pack_time(dump_job_ptr->end_time, buffer); pack_time(dump_job_ptr->end_time_exp, buffer); pack_time(dump_job_ptr->suspend_time, buffer); pack_time(dump_job_ptr->pre_sus_time, buffer); pack_time(dump_job_ptr->resize_time, buffer); pack_time(dump_job_ptr->tot_sus_time, buffer); pack_time(dump_job_ptr->deadline, buffer); pack32(dump_job_ptr->site_factor, buffer); pack16(dump_job_ptr->direct_set_prio, buffer); pack32(dump_job_ptr->job_state, buffer); pack16(dump_job_ptr->kill_on_node_fail, buffer); pack16(dump_job_ptr->batch_flag, buffer); pack16(dump_job_ptr->mail_type, buffer); pack32(dump_job_ptr->state_reason, buffer); pack32(dump_job_ptr->state_reason_prev_db, buffer); pack8(dump_job_ptr->reboot, buffer); pack16(dump_job_ptr->restart_cnt, buffer); pack16(dump_job_ptr->wait_all_nodes, buffer); pack16(dump_job_ptr->warn_flags, buffer); pack16(dump_job_ptr->warn_signal, buffer); pack16(dump_job_ptr->warn_time, buffer); _pack_acct_policy_limit(&dump_job_ptr->limit_set, buffer, SLURM_PROTOCOL_VERSION); packstr(dump_job_ptr->state_desc, buffer); packstr(dump_job_ptr->resp_host, buffer); pack16(dump_job_ptr->alloc_resp_port, buffer); pack16(dump_job_ptr->other_port, buffer); pack8(dump_job_ptr->power_flags, buffer); pack16(dump_job_ptr->start_protocol_ver, buffer); packdouble(dump_job_ptr->billable_tres, buffer); if (IS_JOB_COMPLETING(dump_job_ptr)) { if (dump_job_ptr->nodes_completing == NULL) { dump_job_ptr->nodes_completing = bitmap2node_name(dump_job_ptr->node_bitmap); } packstr(dump_job_ptr->nodes_completing, buffer); } packstr(dump_job_ptr->nodes, buffer); packstr(dump_job_ptr->partition, buffer); packstr(dump_job_ptr->name, buffer); packstr(dump_job_ptr->user_name, buffer); packstr(dump_job_ptr->wckey, buffer); packstr(dump_job_ptr->alloc_node, buffer); packstr(dump_job_ptr->account, buffer); packstr(dump_job_ptr->admin_comment, buffer); packstr(dump_job_ptr->comment, buffer); packstr(dump_job_ptr->gres_alloc, buffer); packstr(dump_job_ptr->gres_req, buffer); packstr(dump_job_ptr->gres_used, buffer); packstr(dump_job_ptr->network, buffer); packstr(dump_job_ptr->licenses, buffer); packstr(dump_job_ptr->mail_user, buffer); packstr(dump_job_ptr->mcs_label, buffer); packstr(dump_job_ptr->resv_name, buffer); packstr(dump_job_ptr->batch_host, buffer); packstr(dump_job_ptr->burst_buffer, buffer); packstr(dump_job_ptr->burst_buffer_state, buffer); packstr(dump_job_ptr->system_comment, buffer); select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, buffer, SLURM_PROTOCOL_VERSION); pack_job_resources(dump_job_ptr->job_resrcs, buffer, SLURM_PROTOCOL_VERSION); packstr_array(dump_job_ptr->spank_job_env, dump_job_ptr->spank_job_env_size, buffer); (void) gres_plugin_job_state_pack(dump_job_ptr->gres_list, buffer, dump_job_ptr->job_id, true, SLURM_PROTOCOL_VERSION); /* Dump job details, if available */ detail_ptr = dump_job_ptr->details; if (detail_ptr) { xassert (detail_ptr->magic == DETAILS_MAGIC); pack16((uint16_t) DETAILS_FLAG, buffer); _dump_job_details(detail_ptr, buffer); } else pack16((uint16_t) 0, buffer); /* no details flag */ /* Dump job steps */ list_for_each(dump_job_ptr->step_list, dump_job_step_state, buffer); pack16((uint16_t) 0, buffer); /* no step flag */ pack32(dump_job_ptr->bit_flags, buffer); packstr(dump_job_ptr->tres_alloc_str, buffer); packstr(dump_job_ptr->tres_fmt_alloc_str, buffer); packstr(dump_job_ptr->tres_req_str, buffer); packstr(dump_job_ptr->tres_fmt_req_str, buffer); packstr(dump_job_ptr->clusters, buffer); _dump_job_fed_details(dump_job_ptr->fed_details, buffer); packstr(dump_job_ptr->origin_cluster, buffer); packstr(dump_job_ptr->cpus_per_tres, buffer); packstr(dump_job_ptr->mem_per_tres, buffer); packstr(dump_job_ptr->tres_bind, buffer); packstr(dump_job_ptr->tres_freq, buffer); packstr(dump_job_ptr->tres_per_job, buffer); packstr(dump_job_ptr->tres_per_node, buffer); packstr(dump_job_ptr->tres_per_socket, buffer); packstr(dump_job_ptr->tres_per_task, buffer); } /* Unpack a job's state information from a buffer */ /* NOTE: assoc_mgr qos, tres and assoc read lock must be unlocked before * calling */ static int _load_job_state(Buf buffer, uint16_t protocol_version) { uint64_t db_index; uint32_t job_id, user_id, group_id, time_limit, priority, alloc_sid; uint32_t exit_code, assoc_id, name_len, time_min; uint32_t next_step_id, total_cpus, total_nodes = 0, cpu_cnt; uint32_t resv_id, spank_job_env_size = 0, qos_id, derived_ec = 0; uint32_t array_job_id = 0, req_switch = 0, wait4switch = 0; uint32_t profile = ACCT_GATHER_PROFILE_NOT_SET, db_flags = 0; uint32_t job_state, delay_boot = 0, site_factor = NICE_OFFSET; time_t start_time, end_time, end_time_exp, suspend_time, pre_sus_time, tot_sus_time; time_t preempt_time = 0, deadline = 0; time_t last_sched_eval = 0; time_t resize_time = 0, now = time(NULL); uint8_t reboot = 0, power_flags = 0; uint32_t array_task_id = NO_VAL, state_reason_prev_db = 0; uint32_t array_flags = 0, max_run_tasks = 0, tot_run_tasks = 0; uint32_t min_exit_code = 0, max_exit_code = 0, tot_comp_tasks = 0; uint32_t het_job_id = 0, het_job_offset = 0, state_reason; uint16_t details, batch_flag, step_flag; uint16_t kill_on_node_fail, direct_set_prio; uint16_t alloc_resp_port, other_port, mail_type, tmp16; uint16_t restart_cnt; uint16_t wait_all_nodes, warn_flags = 0, warn_signal, warn_time; acct_policy_limit_set_t limit_set; uint16_t start_protocol_ver = SLURM_MIN_PROTOCOL_VERSION; char *nodes = NULL, *partition = NULL, *name = NULL, *resp_host = NULL; char *account = NULL, *network = NULL, *mail_user = NULL; char *comment = NULL, *nodes_completing = NULL, *alloc_node = NULL; char *licenses = NULL, *state_desc = NULL, *wckey = NULL; char *resv_name = NULL, *batch_host = NULL; char *gres_alloc = NULL, *gres_req = NULL, *gres_used = NULL; char *burst_buffer = NULL, *burst_buffer_state = NULL; char *admin_comment = NULL, *task_id_str = NULL, *mcs_label = NULL; char *clusters = NULL, *het_job_id_set = NULL, *user_name = NULL; char *batch_features = NULL, *system_comment = NULL; uint32_t task_id_size = NO_VAL; char **spank_job_env = (char **) NULL; List gres_list = NULL, part_ptr_list = NULL; job_record_t *job_ptr = NULL; part_record_t *part_ptr; int error_code, i, qos_error, rc; dynamic_plugin_data_t *select_jobinfo = NULL; job_resources_t *job_resources = NULL; slurmdb_assoc_rec_t assoc_rec; slurmdb_qos_rec_t qos_rec; bool job_finished = false; double billable_tres = (double)NO_VAL; char *tres_alloc_str = NULL, *tres_fmt_alloc_str = NULL, *tres_req_str = NULL, *tres_fmt_req_str = NULL; uint32_t pelog_env_size = 0; char **pelog_env = (char **) NULL; job_fed_details_t *job_fed_details = NULL; assoc_mgr_lock_t locks = { .assoc = READ_LOCK, .qos = READ_LOCK, .tres = READ_LOCK, .user = READ_LOCK }; memset(&limit_set, 0, sizeof(limit_set)); limit_set.tres = xcalloc(slurmctld_tres_cnt, sizeof(uint16_t)); if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) { safe_unpack32(&array_job_id, buffer); safe_unpack32(&array_task_id, buffer); /* Job Array record */ safe_unpack32(&task_id_size, buffer); if (task_id_size != NO_VAL) { if (task_id_size) { safe_unpackstr_xmalloc(&task_id_str, &name_len, buffer); } safe_unpack32(&array_flags, buffer); safe_unpack32(&max_run_tasks, buffer); safe_unpack32(&tot_run_tasks, buffer); safe_unpack32(&min_exit_code, buffer); safe_unpack32(&max_exit_code, buffer); safe_unpack32(&tot_comp_tasks, buffer); } safe_unpack32(&assoc_id, buffer); safe_unpackstr_xmalloc(&batch_features, &name_len, buffer); safe_unpack32(&delay_boot, buffer); safe_unpack32(&job_id, buffer); /* validity test as possible */ if (job_id == 0) { verbose("Invalid job_id %u", job_id); goto unpack_error; } job_ptr = find_job_record(job_id); if (job_ptr == NULL) { job_ptr = _create_job_record(1); if (!job_ptr) { error("Create job entry failed for JobId=%u", job_id); goto unpack_error; } job_ptr->job_id = job_id; job_ptr->array_job_id = array_job_id; job_ptr->array_task_id = array_task_id; } safe_unpack32(&user_id, buffer); safe_unpack32(&group_id, buffer); safe_unpack32(&time_limit, buffer); safe_unpack32(&time_min, buffer); safe_unpack32(&priority, buffer); safe_unpack32(&alloc_sid, buffer); safe_unpack32(&total_cpus, buffer); safe_unpack32(&total_nodes, buffer); safe_unpack32(&cpu_cnt, buffer); safe_unpack32(&exit_code, buffer); safe_unpack32(&derived_ec, buffer); safe_unpack64(&db_index, buffer); safe_unpack32(&resv_id, buffer); safe_unpack32(&next_step_id, buffer); safe_unpack32(&het_job_id, buffer); safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer); safe_unpack32(&het_job_offset, buffer); safe_unpack32(&qos_id, buffer); safe_unpack32(&req_switch, buffer); safe_unpack32(&wait4switch, buffer); safe_unpack32(&profile, buffer); safe_unpack32(&db_flags, buffer); safe_unpack_time(&last_sched_eval, buffer); safe_unpack_time(&preempt_time, buffer); safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); safe_unpack_time(&end_time_exp, buffer); safe_unpack_time(&suspend_time, buffer); safe_unpack_time(&pre_sus_time, buffer); safe_unpack_time(&resize_time, buffer); safe_unpack_time(&tot_sus_time, buffer); safe_unpack_time(&deadline, buffer); safe_unpack32(&site_factor, buffer); safe_unpack16(&direct_set_prio, buffer); safe_unpack32(&job_state, buffer); safe_unpack16(&kill_on_node_fail, buffer); safe_unpack16(&batch_flag, buffer); safe_unpack16(&mail_type, buffer); safe_unpack32(&state_reason, buffer); safe_unpack32(&state_reason_prev_db, buffer); safe_unpack8 (&reboot, buffer); safe_unpack16(&restart_cnt, buffer); safe_unpack16(&wait_all_nodes, buffer); safe_unpack16(&warn_flags, buffer); safe_unpack16(&warn_signal, buffer); safe_unpack16(&warn_time, buffer); _unpack_acct_policy_limit_members(&limit_set, buffer, protocol_version); safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); safe_unpack16(&alloc_resp_port, buffer); safe_unpack16(&other_port, buffer); safe_unpack8(&power_flags, buffer); safe_unpack16(&start_protocol_ver, buffer); safe_unpackdouble(&billable_tres, buffer); if (job_state & JOB_COMPLETING) { safe_unpackstr_xmalloc(&nodes_completing, &name_len, buffer); } safe_unpackstr_xmalloc(&nodes, &name_len, buffer); safe_unpackstr_xmalloc(&partition, &name_len, buffer); if (partition == NULL) { error("No partition for JobId=%u", job_id); goto unpack_error; } part_ptr = find_part_record (partition); if (part_ptr == NULL) { char *err_part = NULL; part_ptr_list = get_part_list(partition, &err_part); if (part_ptr_list) { part_ptr = list_peek(part_ptr_list); if (list_count(part_ptr_list) == 1) FREE_NULL_LIST(part_ptr_list); } else { verbose("Invalid partition (%s) for JobId=%u", err_part, job_id); xfree(err_part); /* not fatal error, partition could have been * removed, reset_job_bitmaps() will clean-up * this job */ } } safe_unpackstr_xmalloc(&name, &name_len, buffer); safe_unpackstr_xmalloc(&user_name, &name_len, buffer); safe_unpackstr_xmalloc(&wckey, &name_len, buffer); safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); safe_unpackstr_xmalloc(&account, &name_len, buffer); safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer); safe_unpackstr_xmalloc(&comment, &name_len, buffer); safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer); safe_unpackstr_xmalloc(&gres_req, &name_len, buffer); safe_unpackstr_xmalloc(&gres_used, &name_len, buffer); safe_unpackstr_xmalloc(&network, &name_len, buffer); safe_unpackstr_xmalloc(&licenses, &name_len, buffer); safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer); safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); safe_unpackstr_xmalloc(&batch_host, &name_len, buffer); safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer); safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer); safe_unpackstr_xmalloc(&system_comment, &name_len, buffer); if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, protocol_version)) goto unpack_error; if (unpack_job_resources(&job_resources, buffer, protocol_version)) goto unpack_error; safe_unpackstr_array(&spank_job_env, &spank_job_env_size, buffer); if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id, protocol_version) != SLURM_SUCCESS) goto unpack_error; gres_plugin_job_state_log(gres_list, job_id); safe_unpack16(&details, buffer); if ((details == DETAILS_FLAG) && (_load_job_details(job_ptr, buffer, protocol_version))) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_SYSTEM; xfree(job_ptr->state_desc); job_ptr->end_time = now; goto unpack_error; } safe_unpack16(&step_flag, buffer); while (step_flag == STEP_FLAG) { /* * No need to put these into accounting if they * haven't been since all information will be * put in when the job is finished. */ if ((error_code = load_step_state(job_ptr, buffer, protocol_version))) goto unpack_error; safe_unpack16(&step_flag, buffer); } safe_unpack32(&job_ptr->bit_flags, buffer); job_ptr->bit_flags &= ~BACKFILL_TEST; job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST; safe_unpackstr_xmalloc(&tres_alloc_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_fmt_alloc_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer); safe_unpackstr_xmalloc(&clusters, &name_len, buffer); if ((error_code = _load_job_fed_details(&job_fed_details, buffer, protocol_version))) goto unpack_error; safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len, buffer); } else if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) { uint16_t uint16_tmp; safe_unpack32(&array_job_id, buffer); safe_unpack32(&array_task_id, buffer); /* Job Array record */ safe_unpack32(&task_id_size, buffer); if (task_id_size != NO_VAL) { if (task_id_size) { safe_unpackstr_xmalloc(&task_id_str, &name_len, buffer); } safe_unpack32(&array_flags, buffer); safe_unpack32(&max_run_tasks, buffer); safe_unpack32(&tot_run_tasks, buffer); safe_unpack32(&min_exit_code, buffer); safe_unpack32(&max_exit_code, buffer); safe_unpack32(&tot_comp_tasks, buffer); } safe_unpack32(&assoc_id, buffer); safe_unpackstr_xmalloc(&batch_features, &name_len, buffer); safe_unpack32(&delay_boot, buffer); safe_unpack32(&job_id, buffer); /* validity test as possible */ if (job_id == 0) { verbose("Invalid job_id %u", job_id); goto unpack_error; } job_ptr = find_job_record(job_id); if (job_ptr == NULL) { job_ptr = _create_job_record(1); if (!job_ptr) { error("Create job entry failed for JobId=%u", job_id); goto unpack_error; } job_ptr->job_id = job_id; job_ptr->array_job_id = array_job_id; job_ptr->array_task_id = array_task_id; } safe_unpack32(&user_id, buffer); safe_unpack32(&group_id, buffer); safe_unpack32(&time_limit, buffer); safe_unpack32(&time_min, buffer); safe_unpack32(&priority, buffer); safe_unpack32(&alloc_sid, buffer); safe_unpack32(&total_cpus, buffer); safe_unpack32(&total_nodes, buffer); safe_unpack32(&cpu_cnt, buffer); safe_unpack32(&exit_code, buffer); safe_unpack32(&derived_ec, buffer); safe_unpack64(&db_index, buffer); safe_unpack32(&resv_id, buffer); safe_unpack32(&next_step_id, buffer); safe_unpack32(&het_job_id, buffer); safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer); safe_unpack32(&het_job_offset, buffer); safe_unpack32(&qos_id, buffer); safe_unpack32(&req_switch, buffer); safe_unpack32(&wait4switch, buffer); safe_unpack32(&profile, buffer); safe_unpack32(&db_flags, buffer); safe_unpack_time(&last_sched_eval, buffer); safe_unpack_time(&preempt_time, buffer); safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); safe_unpack_time(&end_time_exp, buffer); safe_unpack_time(&suspend_time, buffer); safe_unpack_time(&pre_sus_time, buffer); safe_unpack_time(&resize_time, buffer); safe_unpack_time(&tot_sus_time, buffer); safe_unpack_time(&deadline, buffer); safe_unpack32(&site_factor, buffer); safe_unpack16(&direct_set_prio, buffer); safe_unpack32(&job_state, buffer); safe_unpack16(&kill_on_node_fail, buffer); safe_unpack16(&batch_flag, buffer); safe_unpack16(&mail_type, buffer); safe_unpack32(&state_reason, buffer); safe_unpack32(&state_reason_prev_db, buffer); safe_unpack8 (&reboot, buffer); safe_unpack16(&restart_cnt, buffer); safe_unpack16(&wait_all_nodes, buffer); safe_unpack16(&warn_flags, buffer); safe_unpack16(&warn_signal, buffer); safe_unpack16(&warn_time, buffer); _unpack_acct_policy_limit_members(&limit_set, buffer, protocol_version); safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); safe_unpack16(&alloc_resp_port, buffer); safe_unpack16(&other_port, buffer); safe_unpack8(&power_flags, buffer); safe_unpack16(&start_protocol_ver, buffer); safe_unpackdouble(&billable_tres, buffer); if (job_state & JOB_COMPLETING) { safe_unpackstr_xmalloc(&nodes_completing, &name_len, buffer); } safe_unpackstr_xmalloc(&nodes, &name_len, buffer); safe_unpackstr_xmalloc(&partition, &name_len, buffer); if (partition == NULL) { error("No partition for JobId=%u", job_id); goto unpack_error; } part_ptr = find_part_record (partition); if (part_ptr == NULL) { char *err_part = NULL; part_ptr_list = get_part_list(partition, &err_part); if (part_ptr_list) { part_ptr = list_peek(part_ptr_list); if (list_count(part_ptr_list) == 1) FREE_NULL_LIST(part_ptr_list); } else { verbose("Invalid partition (%s) for JobId=%u", err_part, job_id); xfree(err_part); /* not fatal error, partition could have been * removed, reset_job_bitmaps() will clean-up * this job */ } } safe_unpackstr_xmalloc(&name, &name_len, buffer); safe_unpackstr_xmalloc(&user_name, &name_len, buffer); safe_unpackstr_xmalloc(&wckey, &name_len, buffer); safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); safe_unpackstr_xmalloc(&account, &name_len, buffer); safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer); safe_unpackstr_xmalloc(&comment, &name_len, buffer); safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer); safe_unpackstr_xmalloc(&gres_req, &name_len, buffer); safe_unpackstr_xmalloc(&gres_used, &name_len, buffer); safe_unpackstr_xmalloc(&network, &name_len, buffer); safe_unpackstr_xmalloc(&licenses, &name_len, buffer); safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer); safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); safe_unpackstr_xmalloc(&batch_host, &name_len, buffer); safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer); safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer); safe_unpackstr_xmalloc(&system_comment, &name_len, buffer); if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, protocol_version)) goto unpack_error; if (unpack_job_resources(&job_resources, buffer, protocol_version)) goto unpack_error; safe_unpack16(&uint16_tmp, buffer); /* was ckpt_interval */ /* fake out the former checkpoint plugin */ { uint16_t id; uint32_t size; safe_unpack16(&id, buffer); safe_unpack32(&size, buffer); /* skip past any checkpoint plugin info */ size += get_buf_offset(buffer); set_buf_offset(buffer, size); } safe_unpackstr_array(&spank_job_env, &spank_job_env_size, buffer); if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id, protocol_version) != SLURM_SUCCESS) goto unpack_error; gres_plugin_job_state_log(gres_list, job_id); safe_unpack16(&details, buffer); if ((details == DETAILS_FLAG) && (_load_job_details(job_ptr, buffer, protocol_version))) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_SYSTEM; xfree(job_ptr->state_desc); job_ptr->end_time = now; goto unpack_error; } safe_unpack16(&step_flag, buffer); /* * The batch_host is needed to create a step_layout for the * batch step since that wasn't packed until 20.02. */ job_ptr->batch_host = batch_host; while (step_flag == STEP_FLAG) { /* * No need to put these into accounting if they * haven't been since all information will be * put in when the job is finished. */ if ((error_code = load_step_state(job_ptr, buffer, protocol_version))) goto unpack_error; safe_unpack16(&step_flag, buffer); } job_ptr->batch_host = NULL; safe_unpack32(&job_ptr->bit_flags, buffer); job_ptr->bit_flags &= ~BACKFILL_TEST; job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST; safe_unpackstr_xmalloc(&tres_alloc_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_fmt_alloc_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer); safe_unpackstr_xmalloc(&clusters, &name_len, buffer); if ((error_code = _load_job_fed_details(&job_fed_details, buffer, protocol_version))) goto unpack_error; safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len, buffer); } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { uint16_t uint16_tmp; safe_unpack32(&array_job_id, buffer); safe_unpack32(&array_task_id, buffer); /* Job Array record */ safe_unpack32(&task_id_size, buffer); if (task_id_size != NO_VAL) { if (task_id_size) { safe_unpackstr_xmalloc(&task_id_str, &name_len, buffer); } safe_unpack32(&array_flags, buffer); safe_unpack32(&max_run_tasks, buffer); safe_unpack32(&tot_run_tasks, buffer); safe_unpack32(&min_exit_code, buffer); safe_unpack32(&max_exit_code, buffer); safe_unpack32(&tot_comp_tasks, buffer); } safe_unpack32(&assoc_id, buffer); safe_unpackstr_xmalloc(&batch_features, &name_len, buffer); safe_unpack32(&delay_boot, buffer); safe_unpack32(&job_id, buffer); /* validity test as possible */ if (job_id == 0) { verbose("Invalid job_id %u", job_id); goto unpack_error; } job_ptr = find_job_record(job_id); if (job_ptr == NULL) { job_ptr = _create_job_record(1); if (!job_ptr) { error("Create job entry failed for JobId=%u", job_id); goto unpack_error; } job_ptr->job_id = job_id; job_ptr->array_job_id = array_job_id; job_ptr->array_task_id = array_task_id; } safe_unpack32(&user_id, buffer); safe_unpack32(&group_id, buffer); safe_unpack32(&time_limit, buffer); safe_unpack32(&time_min, buffer); safe_unpack32(&priority, buffer); safe_unpack32(&alloc_sid, buffer); safe_unpack32(&total_cpus, buffer); safe_unpack32(&total_nodes, buffer); safe_unpack32(&cpu_cnt, buffer); safe_unpack32(&exit_code, buffer); safe_unpack32(&derived_ec, buffer); safe_unpack64(&db_index, buffer); safe_unpack32(&resv_id, buffer); safe_unpack32(&next_step_id, buffer); safe_unpack32(&het_job_id, buffer); safe_unpackstr_xmalloc(&het_job_id_set, &name_len, buffer); safe_unpack32(&het_job_offset, buffer); safe_unpack32(&qos_id, buffer); safe_unpack32(&req_switch, buffer); safe_unpack32(&wait4switch, buffer); safe_unpack32(&profile, buffer); safe_unpack_time(&last_sched_eval, buffer); safe_unpack_time(&preempt_time, buffer); safe_unpack_time(&start_time, buffer); safe_unpack_time(&end_time, buffer); safe_unpack_time(&end_time_exp, buffer); safe_unpack_time(&suspend_time, buffer); safe_unpack_time(&pre_sus_time, buffer); safe_unpack_time(&resize_time, buffer); safe_unpack_time(&tot_sus_time, buffer); safe_unpack_time(&deadline, buffer); safe_unpack16(&direct_set_prio, buffer); safe_unpack32(&job_state, buffer); safe_unpack16(&kill_on_node_fail, buffer); safe_unpack16(&batch_flag, buffer); safe_unpack16(&mail_type, buffer); safe_unpack16(&tmp16, buffer); state_reason = tmp16; safe_unpack8 (&reboot, buffer); safe_unpack16(&restart_cnt, buffer); safe_unpack16(&wait_all_nodes, buffer); safe_unpack16(&warn_flags, buffer); safe_unpack16(&warn_signal, buffer); safe_unpack16(&warn_time, buffer); _unpack_acct_policy_limit_members(&limit_set, buffer, protocol_version); safe_unpackstr_xmalloc(&state_desc, &name_len, buffer); safe_unpackstr_xmalloc(&resp_host, &name_len, buffer); safe_unpack16(&alloc_resp_port, buffer); safe_unpack16(&other_port, buffer); safe_unpack8(&power_flags, buffer); safe_unpack16(&start_protocol_ver, buffer); safe_unpackdouble(&billable_tres, buffer); if (job_state & JOB_COMPLETING) { safe_unpackstr_xmalloc(&nodes_completing, &name_len, buffer); } safe_unpackstr_xmalloc(&nodes, &name_len, buffer); safe_unpackstr_xmalloc(&partition, &name_len, buffer); if (partition == NULL) { error("No partition for JobId=%u", job_id); goto unpack_error; } part_ptr = find_part_record (partition); if (part_ptr == NULL) { char *err_part = NULL; part_ptr_list = get_part_list(partition, &err_part); if (part_ptr_list) { part_ptr = list_peek(part_ptr_list); if (list_count(part_ptr_list) == 1) FREE_NULL_LIST(part_ptr_list); } else { verbose("Invalid partition (%s) for JobId=%u", err_part, job_id); xfree(err_part); /* not fatal error, partition could have been * removed, reset_job_bitmaps() will clean-up * this job */ } } safe_unpackstr_xmalloc(&name, &name_len, buffer); safe_unpackstr_xmalloc(&user_name, &name_len, buffer); safe_unpackstr_xmalloc(&wckey, &name_len, buffer); safe_unpackstr_xmalloc(&alloc_node, &name_len, buffer); safe_unpackstr_xmalloc(&account, &name_len, buffer); safe_unpackstr_xmalloc(&admin_comment, &name_len, buffer); safe_unpackstr_xmalloc(&comment, &name_len, buffer); safe_unpackstr_xmalloc(&gres_alloc, &name_len, buffer); safe_unpackstr_xmalloc(&gres_req, &name_len, buffer); safe_unpackstr_xmalloc(&gres_used, &name_len, buffer); safe_unpackstr_xmalloc(&network, &name_len, buffer); safe_unpackstr_xmalloc(&licenses, &name_len, buffer); safe_unpackstr_xmalloc(&mail_user, &name_len, buffer); safe_unpackstr_xmalloc(&mcs_label, &name_len, buffer); safe_unpackstr_xmalloc(&resv_name, &name_len, buffer); safe_unpackstr_xmalloc(&batch_host, &name_len, buffer); safe_unpackstr_xmalloc(&burst_buffer, &name_len, buffer); safe_unpackstr_xmalloc(&burst_buffer_state, &name_len, buffer); safe_unpackstr_xmalloc(&system_comment, &name_len, buffer); if (select_g_select_jobinfo_unpack(&select_jobinfo, buffer, protocol_version)) goto unpack_error; if (unpack_job_resources(&job_resources, buffer, protocol_version)) goto unpack_error; safe_unpack16(&uint16_tmp, buffer); /* was ckpt_interval */ /* fake out the former checkpoint plugin */ { uint16_t id; uint32_t size; safe_unpack16(&id, buffer); safe_unpack32(&size, buffer); /* skip past any checkpoint plugin info */ size += get_buf_offset(buffer); set_buf_offset(buffer, size); } safe_unpackstr_array(&spank_job_env, &spank_job_env_size, buffer); if (gres_plugin_job_state_unpack(&gres_list, buffer, job_id, protocol_version) != SLURM_SUCCESS) goto unpack_error; gres_plugin_job_state_log(gres_list, job_id); safe_unpack16(&details, buffer); if ((details == DETAILS_FLAG) && (_load_job_details(job_ptr, buffer, protocol_version))) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_SYSTEM; xfree(job_ptr->state_desc); job_ptr->end_time = now; goto unpack_error; } safe_unpack16(&step_flag, buffer); while (step_flag == STEP_FLAG) { /* * No need to put these into accounting if they * haven't been since all information will be * put in when the job is finished. */ if ((error_code = load_step_state(job_ptr, buffer, protocol_version))) goto unpack_error; safe_unpack16(&step_flag, buffer); } safe_unpack32(&job_ptr->bit_flags, buffer); job_ptr->bit_flags &= ~BACKFILL_TEST; job_ptr->bit_flags |= JOB_MEM_SET; safe_unpackstr_xmalloc(&tres_alloc_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_fmt_alloc_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_req_str, &name_len, buffer); safe_unpackstr_xmalloc(&tres_fmt_req_str, &name_len, buffer); safe_unpackstr_xmalloc(&clusters, &name_len, buffer); if ((error_code = _load_job_fed_details(&job_fed_details, buffer, protocol_version))) goto unpack_error; safe_unpackstr_xmalloc(&job_ptr->origin_cluster, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->cpus_per_tres, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->mem_per_tres, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_bind, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_freq, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_job, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_node, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_socket, &name_len, buffer); safe_unpackstr_xmalloc(&job_ptr->tres_per_task, &name_len, buffer); } else { error("%s: protocol_version %hu not supported", __func__, protocol_version); goto unpack_error; } /* "Don't load "unlinked" job. */ if (job_ptr->job_id == NO_VAL) { debug("skipping unlinked job"); rc = SLURM_SUCCESS; goto free_it; } if (((job_state & JOB_STATE_BASE) >= JOB_END) || (batch_flag > MAX_BATCH_REQUEUE)) { error("Invalid data for JobId=%u: job_state=%u batch_flag=%u", job_id, job_state, batch_flag); goto unpack_error; } if (kill_on_node_fail > 1) { error("Invalid data for JobId=%u: kill_on_node_fail=%u", job_id, kill_on_node_fail); goto unpack_error; } if ((priority > 1) && (direct_set_prio == 0)) { highest_prio = MAX(highest_prio, priority); lowest_prio = MIN(lowest_prio, priority); } #if 0 /* * This is not necessary since the job_id_sequence is checkpointed and * the jobid will be checked if it's in use in get_next_job_id(). */ /* Base job_id_sequence off of local job id but only if the job * originated from this cluster -- so that the local job id of a * different cluster isn't restored here. */ if (!job_fed_details || !xstrcmp(job_fed_details->origin_str, slurmctld_conf.cluster_name)) local_job_id = fed_mgr_get_local_id(job_id); if (job_id_sequence <= local_job_id) job_id_sequence = local_job_id + 1; #endif xfree(job_ptr->tres_alloc_str); job_ptr->tres_alloc_str = tres_alloc_str; tres_alloc_str = NULL; xfree(job_ptr->tres_req_str); job_ptr->tres_req_str = tres_req_str; tres_req_str = NULL; xfree(job_ptr->tres_fmt_alloc_str); job_ptr->tres_fmt_alloc_str = tres_fmt_alloc_str; tres_fmt_alloc_str = NULL; xfree(job_ptr->tres_fmt_req_str); job_ptr->tres_fmt_req_str = tres_fmt_req_str; tres_fmt_req_str = NULL; xfree(job_ptr->account); job_ptr->account = account; xstrtolower(job_ptr->account); account = NULL; /* reused, nothing left to free */ xfree(job_ptr->alloc_node); job_ptr->alloc_node = alloc_node; alloc_node = NULL; /* reused, nothing left to free */ job_ptr->alloc_resp_port = alloc_resp_port; job_ptr->alloc_sid = alloc_sid; job_ptr->assoc_id = assoc_id; job_ptr->delay_boot = delay_boot; xfree(job_ptr->admin_comment); job_ptr->admin_comment = admin_comment; admin_comment = NULL; /* reused, nothing left to free */ xfree(job_ptr->system_comment); job_ptr->system_comment = system_comment; system_comment = NULL; /* reused, nothing left to free */ xfree(job_ptr->batch_features); job_ptr->batch_features = batch_features; batch_features = NULL; /* reused, nothing left to free */ job_ptr->batch_flag = batch_flag; xfree(job_ptr->batch_host); job_ptr->batch_host = batch_host; batch_host = NULL; /* reused, nothing left to free */ xfree(job_ptr->burst_buffer); job_ptr->burst_buffer = burst_buffer; burst_buffer = NULL; /* reused, nothing left to free */ xfree(job_ptr->burst_buffer_state); job_ptr->burst_buffer_state = burst_buffer_state; burst_buffer_state = NULL; /* reused, nothing left to free */ xfree(job_ptr->comment); job_ptr->comment = comment; comment = NULL; /* reused, nothing left to free */ job_ptr->billable_tres = billable_tres; xfree(job_ptr->gres_alloc); job_ptr->gres_alloc = gres_alloc; gres_alloc = NULL; /* reused, nothing left to free */ xfree(job_ptr->gres_req); job_ptr->gres_req = gres_req; gres_req = NULL; /* reused, nothing left to free */ xfree(job_ptr->gres_used); job_ptr->gres_used = gres_used; gres_used = NULL; /* reused, nothing left to free */ job_ptr->gres_list = gres_list; job_ptr->site_factor = site_factor; job_ptr->direct_set_prio = direct_set_prio; job_ptr->db_index = db_index; job_ptr->derived_ec = derived_ec; job_ptr->end_time_exp = end_time_exp; job_ptr->end_time = end_time; job_ptr->exit_code = exit_code; job_ptr->group_id = group_id; job_ptr->job_state = job_state; job_ptr->kill_on_node_fail = kill_on_node_fail; xfree(job_ptr->licenses); job_ptr->licenses = licenses; licenses = NULL; /* reused, nothing left to free */ job_ptr->mail_type = mail_type; xfree(job_ptr->mail_user); if (mail_user) job_ptr->mail_user = mail_user; else if (mail_type) job_ptr->mail_user = _get_mail_user(NULL, user_id); mail_user = NULL; /* reused, nothing left to free */ xfree(job_ptr->mcs_label); job_ptr->mcs_label = mcs_label; mcs_label = NULL; /* reused, nothing left to free */ xfree(job_ptr->name); /* in case duplicate record */ job_ptr->name = name; name = NULL; /* reused, nothing left to free */ xfree(job_ptr->user_name); job_ptr->user_name = user_name; user_name = NULL; /* reused, nothing left to free */ xfree(job_ptr->wckey); /* in case duplicate record */ job_ptr->wckey = wckey; xstrtolower(job_ptr->wckey); wckey = NULL; /* reused, nothing left to free */ xfree(job_ptr->network); job_ptr->network = network; network = NULL; /* reused, nothing left to free */ job_ptr->next_step_id = next_step_id; xfree(job_ptr->nodes); /* in case duplicate record */ job_ptr->nodes = nodes; nodes = NULL; /* reused, nothing left to free */ if (nodes_completing) { xfree(job_ptr->nodes_completing); job_ptr->nodes_completing = nodes_completing; nodes_completing = NULL; /* reused, nothing left to free */ } job_ptr->other_port = other_port; job_ptr->power_flags = power_flags; job_ptr->het_job_id = het_job_id; xfree(job_ptr->het_job_id_set); job_ptr->het_job_id_set = het_job_id_set; het_job_id_set = NULL; /* reused, nothing left to free */ job_ptr->het_job_offset = het_job_offset; xfree(job_ptr->partition); job_ptr->partition = partition; partition = NULL; /* reused, nothing left to free */ job_ptr->part_ptr = part_ptr; job_ptr->part_ptr_list = part_ptr_list; job_ptr->pre_sus_time = pre_sus_time; job_ptr->priority = priority; job_ptr->qos_id = qos_id; job_ptr->reboot = reboot; xfree(job_ptr->resp_host); job_ptr->resp_host = resp_host; resp_host = NULL; /* reused, nothing left to free */ job_ptr->resize_time = resize_time; job_ptr->restart_cnt = restart_cnt; job_ptr->resv_id = resv_id; job_ptr->resv_name = resv_name; resv_name = NULL; /* reused, nothing left to free */ job_ptr->select_jobinfo = select_jobinfo; job_ptr->job_resrcs = job_resources; job_ptr->spank_job_env = spank_job_env; job_ptr->spank_job_env_size = spank_job_env_size; job_ptr->start_time = start_time; job_ptr->state_reason = state_reason; job_ptr->state_reason_prev_db = state_reason_prev_db; job_ptr->state_desc = state_desc; state_desc = NULL; /* reused, nothing left to free */ job_ptr->suspend_time = suspend_time; job_ptr->deadline = deadline; if (task_id_size != NO_VAL) { if (!job_ptr->array_recs) job_ptr->array_recs=xmalloc(sizeof(job_array_struct_t)); FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); xfree(job_ptr->array_recs->task_id_str); if (task_id_size) { job_ptr->array_recs->task_id_bitmap = bit_alloc(task_id_size); if (task_id_str) { bit_unfmt_hexmask( job_ptr->array_recs->task_id_bitmap, task_id_str); job_ptr->array_recs->task_id_str = task_id_str; task_id_str = NULL; } job_ptr->array_recs->task_cnt = bit_set_count(job_ptr->array_recs-> task_id_bitmap); if (job_ptr->array_recs->task_cnt > 1) job_count += (job_ptr->array_recs->task_cnt-1); } else xfree(task_id_str); job_ptr->array_recs->array_flags = array_flags; job_ptr->array_recs->max_run_tasks = max_run_tasks; job_ptr->array_recs->tot_run_tasks = tot_run_tasks; job_ptr->array_recs->min_exit_code = min_exit_code; job_ptr->array_recs->max_exit_code = max_exit_code; job_ptr->array_recs->tot_comp_tasks = tot_comp_tasks; } job_ptr->time_last_active = now; job_ptr->time_limit = time_limit; job_ptr->time_min = time_min; job_ptr->total_cpus = total_cpus; if (IS_JOB_PENDING(job_ptr)) job_ptr->node_cnt_wag = total_nodes; else job_ptr->total_nodes = total_nodes; job_ptr->cpu_cnt = cpu_cnt; job_ptr->tot_sus_time = tot_sus_time; job_ptr->last_sched_eval = last_sched_eval; job_ptr->preempt_time = preempt_time; job_ptr->user_id = user_id; job_ptr->wait_all_nodes = wait_all_nodes; job_ptr->warn_flags = warn_flags; job_ptr->warn_signal = warn_signal; job_ptr->warn_time = warn_time; memcpy(&job_ptr->limit_set, &limit_set, sizeof(acct_policy_limit_set_t)); limit_set.tres = NULL; job_ptr->req_switch = req_switch; job_ptr->wait4switch = wait4switch; job_ptr->profile = profile; job_ptr->db_flags = db_flags; /* * This needs to always to initialized to "true". The select * plugin will deal with it every time it goes through the * logic if req_switch or wait4switch are set. */ job_ptr->best_switch = true; job_ptr->start_protocol_ver = start_protocol_ver; _add_job_hash(job_ptr); _add_job_array_hash(job_ptr); memset(&assoc_rec, 0, sizeof(assoc_rec)); /* * For speed and accurracy we will first see if we once had an * association record. If not look for it by * account,partition, user_id. */ if (job_ptr->assoc_id) assoc_rec.id = job_ptr->assoc_id; else { assoc_rec.acct = job_ptr->account; if (job_ptr->part_ptr) assoc_rec.partition = job_ptr->part_ptr->name; assoc_rec.uid = job_ptr->user_id; } assoc_mgr_lock(&locks); if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, &job_ptr->assoc_ptr, true) && (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) && (!IS_JOB_FINISHED(job_ptr))) { _job_fail_account(job_ptr, __func__); } else { job_ptr->assoc_id = assoc_rec.id; info("Recovered %pJ Assoc=%u", job_ptr, job_ptr->assoc_id); if (job_ptr->state_reason == FAIL_ACCOUNT) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); } /* make sure we have started this job in accounting */ if (!job_ptr->db_index) { debug("starting %pJ in accounting", job_ptr); if (!with_slurmdbd) jobacct_storage_g_job_start( acct_db_conn, job_ptr); if (slurmctld_init_db && IS_JOB_SUSPENDED(job_ptr)) { jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); } } /* make sure we have this job completed in the database */ if (IS_JOB_FINISHED(job_ptr)) { if (slurmctld_init_db && !(job_ptr->bit_flags & TRES_STR_CALC) && job_ptr->tres_alloc_cnt && (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64)) set_job_tres_alloc_str(job_ptr, false); jobacct_storage_g_job_complete( acct_db_conn, job_ptr); job_finished = 1; } } if (!job_finished && job_ptr->qos_id && (job_ptr->state_reason != FAIL_ACCOUNT)) { memset(&qos_rec, 0, sizeof(qos_rec)); qos_rec.id = job_ptr->qos_id; job_ptr->qos_ptr = _determine_and_validate_qos( job_ptr->resv_name, job_ptr->assoc_ptr, job_ptr->limit_set.qos, &qos_rec, &qos_error, true, LOG_LEVEL_ERROR); if ((qos_error != SLURM_SUCCESS) && !job_ptr->limit_set.qos) { job_fail_qos(job_ptr, __func__); } else { job_ptr->qos_id = qos_rec.id; if (job_ptr->state_reason == FAIL_QOS) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); } } } /* * do this after the format string just in case for some * reason the tres_alloc_str is NULL but not the fmt_str */ if (job_ptr->tres_alloc_str) assoc_mgr_set_tres_cnt_array( &job_ptr->tres_alloc_cnt, job_ptr->tres_alloc_str, 0, true); else job_set_alloc_tres(job_ptr, true); if (job_ptr->tres_req_str) assoc_mgr_set_tres_cnt_array( &job_ptr->tres_req_cnt, job_ptr->tres_req_str, 0, true); else job_set_req_tres(job_ptr, true); assoc_mgr_unlock(&locks); build_node_details(job_ptr, false); /* set node_addr */ gres_build_job_details(job_ptr->gres_list, &job_ptr->gres_detail_cnt, &job_ptr->gres_detail_str, &job_ptr->gres_used); job_ptr->clusters = clusters; job_ptr->fed_details = job_fed_details; return SLURM_SUCCESS; unpack_error: error("Incomplete job record"); rc = SLURM_ERROR; free_it: xfree(alloc_node); xfree(account); xfree(admin_comment); xfree(batch_features); xfree(batch_host); xfree(burst_buffer); xfree(clusters); xfree(comment); xfree(gres_alloc); xfree(gres_req); xfree(gres_used); xfree(het_job_id_set); free_job_fed_details(&job_fed_details); free_job_resources(&job_resources); xfree(resp_host); xfree(licenses); xfree(limit_set.tres); xfree(mail_user); xfree(mcs_label); xfree(name); xfree(nodes); xfree(nodes_completing); xfree(partition); FREE_NULL_LIST(part_ptr_list); xfree(resv_name); for (i = 0; i < spank_job_env_size; i++) xfree(spank_job_env[i]); xfree(spank_job_env); xfree(state_desc); xfree(system_comment); xfree(task_id_str); xfree(tres_alloc_str); xfree(tres_fmt_alloc_str); xfree(tres_fmt_req_str); xfree(tres_req_str); xfree(user_name); xfree(wckey); select_g_select_jobinfo_free(select_jobinfo); if (job_ptr) { if (job_ptr->job_id == 0) job_ptr->job_id = NO_VAL; purge_job_record(job_ptr->job_id); } for (i = 0; i < pelog_env_size; i++) xfree(pelog_env[i]); xfree(pelog_env); return rc; } /* * _dump_job_details - dump the state of a specific job details to * a buffer * IN detail_ptr - pointer to job details for which information is requested * IN/OUT buffer - location to store data, pointers automatically advanced */ void _dump_job_details(struct job_details *detail_ptr, Buf buffer) { /* * Some job fields can change in the course of scheduling, so we * report the original values supplied by the user rather than * an intermediate value that might be set by our scheduling * logic (e.g. to enforce a partition, association or QOS limit). * * Fields subject to change and their original values are as follows: * min_cpus orig_min_cpus * max_cpus orig_max_cpus * cpus_per_task orig_cpus_per_task * pn_min_cpus orig_pn_min_cpus * pn_min_memory orig_pn_min_memory * dependency orig_dependency */ pack32(detail_ptr->orig_min_cpus, buffer); /* subject to change */ pack32(detail_ptr->orig_max_cpus, buffer); /* subject to change */ pack32(detail_ptr->min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); pack32(detail_ptr->num_tasks, buffer); packstr(detail_ptr->acctg_freq, buffer); pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->core_spec, buffer); pack16(detail_ptr->orig_cpus_per_task, buffer); /* subject to change */ pack32(detail_ptr->nice, buffer); pack16(detail_ptr->ntasks_per_node, buffer); pack16(detail_ptr->requeue, buffer); pack32(detail_ptr->task_dist, buffer); pack8(detail_ptr->share_res, buffer); pack8(detail_ptr->whole_node, buffer); packstr(detail_ptr->cpu_bind, buffer); pack16(detail_ptr->cpu_bind_type, buffer); packstr(detail_ptr->mem_bind, buffer); pack16(detail_ptr->mem_bind_type, buffer); pack16(detail_ptr->plane_size, buffer); pack8(detail_ptr->open_mode, buffer); pack8(detail_ptr->overcommit, buffer); pack8(detail_ptr->prolog_running, buffer); pack32(detail_ptr->orig_pn_min_cpus, buffer); /* subject to change */ pack64(detail_ptr->orig_pn_min_memory, buffer); /* subject to change */ pack32(detail_ptr->pn_min_tmp_disk, buffer); pack32(detail_ptr->cpu_freq_min, buffer); pack32(detail_ptr->cpu_freq_max, buffer); pack32(detail_ptr->cpu_freq_gov, buffer); pack_time(detail_ptr->begin_time, buffer); pack_time(detail_ptr->accrue_time, buffer); pack_time(detail_ptr->submit_time, buffer); packstr(detail_ptr->req_nodes, buffer); packstr(detail_ptr->exc_nodes, buffer); packstr(detail_ptr->features, buffer); packstr(detail_ptr->cluster_features, buffer); pack_dep_list(detail_ptr->depend_list, buffer, SLURM_PROTOCOL_VERSION); packstr(detail_ptr->dependency, buffer); packstr(detail_ptr->orig_dependency, buffer); /* subject to change */ packstr(detail_ptr->std_err, buffer); packstr(detail_ptr->std_in, buffer); packstr(detail_ptr->std_out, buffer); packstr(detail_ptr->work_dir, buffer); pack_multi_core_data(detail_ptr->mc_ptr, buffer, SLURM_PROTOCOL_VERSION); packstr_array(detail_ptr->argv, detail_ptr->argc, buffer); packstr_array(detail_ptr->env_sup, detail_ptr->env_cnt, buffer); } /* _load_job_details - Unpack a job details information from buffer */ static int _load_job_details(job_record_t *job_ptr, Buf buffer, uint16_t protocol_version) { char *acctg_freq = NULL, *req_nodes = NULL, *exc_nodes = NULL; char *features = NULL, *cpu_bind = NULL, *dependency = NULL; char *orig_dependency = NULL, *mem_bind, *cluster_features = NULL; char *err = NULL, *in = NULL, *out = NULL, *work_dir = NULL; char **argv = (char **) NULL, **env_sup = (char **) NULL; uint32_t min_nodes, max_nodes; uint32_t min_cpus = 1, max_cpus = NO_VAL; uint32_t pn_min_cpus, pn_min_tmp_disk; uint64_t pn_min_memory; uint32_t cpu_freq_min = NO_VAL; uint32_t cpu_freq_max = NO_VAL; uint32_t cpu_freq_gov = NO_VAL, nice = 0; uint32_t num_tasks, name_len, argc = 0, env_cnt = 0, task_dist; uint16_t contiguous, core_spec = NO_VAL16; uint16_t ntasks_per_node, cpus_per_task, requeue; uint16_t cpu_bind_type, mem_bind_type, plane_size; uint8_t open_mode, overcommit, prolog_running; uint8_t share_res, whole_node; time_t begin_time, accrue_time = 0, submit_time; int i; List depend_list = NULL; multi_core_data_t *mc_ptr; /* unpack the job's details from the buffer */ if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) { safe_unpack32(&min_cpus, buffer); safe_unpack32(&max_cpus, buffer); safe_unpack32(&min_nodes, buffer); safe_unpack32(&max_nodes, buffer); safe_unpack32(&num_tasks, buffer); safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer); safe_unpack16(&contiguous, buffer); safe_unpack16(&core_spec, buffer); safe_unpack16(&cpus_per_task, buffer); safe_unpack32(&nice, buffer); safe_unpack16(&ntasks_per_node, buffer); safe_unpack16(&requeue, buffer); safe_unpack32(&task_dist, buffer); safe_unpack8(&share_res, buffer); safe_unpack8(&whole_node, buffer); safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer); safe_unpack16(&cpu_bind_type, buffer); safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer); safe_unpack16(&mem_bind_type, buffer); safe_unpack16(&plane_size, buffer); safe_unpack8(&open_mode, buffer); safe_unpack8(&overcommit, buffer); safe_unpack8(&prolog_running, buffer); safe_unpack32(&pn_min_cpus, buffer); safe_unpack64(&pn_min_memory, buffer); safe_unpack32(&pn_min_tmp_disk, buffer); safe_unpack32(&cpu_freq_min, buffer); safe_unpack32(&cpu_freq_max, buffer); safe_unpack32(&cpu_freq_gov, buffer); safe_unpack_time(&begin_time, buffer); safe_unpack_time(&accrue_time, buffer); safe_unpack_time(&submit_time, buffer); safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); safe_unpackstr_xmalloc(&features, &name_len, buffer); safe_unpackstr_xmalloc(&cluster_features, &name_len, buffer); unpack_dep_list(&depend_list, buffer, protocol_version); safe_unpackstr_xmalloc(&dependency, &name_len, buffer); safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer); safe_unpackstr_xmalloc(&err, &name_len, buffer); safe_unpackstr_xmalloc(&in, &name_len, buffer); safe_unpackstr_xmalloc(&out, &name_len, buffer); safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version)) goto unpack_error; safe_unpackstr_array(&argv, &argc, buffer); safe_unpackstr_array(&env_sup, &env_cnt, buffer); } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { char *temp_str; safe_unpack32(&min_cpus, buffer); safe_unpack32(&max_cpus, buffer); safe_unpack32(&min_nodes, buffer); safe_unpack32(&max_nodes, buffer); safe_unpack32(&num_tasks, buffer); safe_unpackstr_xmalloc(&acctg_freq, &name_len, buffer); safe_unpack16(&contiguous, buffer); safe_unpack16(&core_spec, buffer); safe_unpack16(&cpus_per_task, buffer); safe_unpack32(&nice, buffer); safe_unpack16(&ntasks_per_node, buffer); safe_unpack16(&requeue, buffer); safe_unpack32(&task_dist, buffer); safe_unpack8(&share_res, buffer); safe_unpack8(&whole_node, buffer); safe_unpackstr_xmalloc(&cpu_bind, &name_len, buffer); safe_unpack16(&cpu_bind_type, buffer); safe_unpackstr_xmalloc(&mem_bind, &name_len, buffer); safe_unpack16(&mem_bind_type, buffer); safe_unpack16(&plane_size, buffer); safe_unpack8(&open_mode, buffer); safe_unpack8(&overcommit, buffer); safe_unpack8(&prolog_running, buffer); safe_unpack32(&pn_min_cpus, buffer); safe_unpack64(&pn_min_memory, buffer); safe_unpack32(&pn_min_tmp_disk, buffer); safe_unpack32(&cpu_freq_min, buffer); safe_unpack32(&cpu_freq_max, buffer); safe_unpack32(&cpu_freq_gov, buffer); safe_unpack_time(&begin_time, buffer); safe_unpack_time(&accrue_time, buffer); safe_unpack_time(&submit_time, buffer); safe_unpackstr_xmalloc(&req_nodes, &name_len, buffer); safe_unpackstr_xmalloc(&exc_nodes, &name_len, buffer); safe_unpackstr_xmalloc(&features, &name_len, buffer); safe_unpackstr_xmalloc(&cluster_features, &name_len, buffer); safe_unpackstr_xmalloc(&dependency, &name_len, buffer); safe_unpackstr_xmalloc(&orig_dependency, &name_len, buffer); safe_unpackstr_xmalloc(&err, &name_len, buffer); safe_unpackstr_xmalloc(&in, &name_len, buffer); safe_unpackstr_xmalloc(&out, &name_len, buffer); safe_unpackstr_xmalloc(&work_dir, &name_len, buffer); safe_unpackstr_xmalloc(&temp_str, &name_len, buffer); xfree(temp_str); /* was ckpt_dir */ safe_unpackstr_xmalloc(&temp_str, &name_len, buffer); xfree(temp_str); /* was restart_dir */ if (unpack_multi_core_data(&mc_ptr, buffer, protocol_version)) goto unpack_error; safe_unpackstr_array(&argv, &argc, buffer); safe_unpackstr_array(&env_sup, &env_cnt, buffer); } else { error("_load_job_details: protocol_version " "%hu not supported", protocol_version); goto unpack_error; } /* validity test as possible */ if (contiguous > 1) { error("Invalid data for %pJ: contiguous=%u", job_ptr, contiguous); goto unpack_error; } if ((requeue > 1) || (overcommit > 1)) { error("Invalid data for %pJ: requeue=%u overcommit=%u", job_ptr, requeue, overcommit); goto unpack_error; } if (prolog_running > 4) { error("Invalid data for %pJ: prolog_running=%u", job_ptr, prolog_running); goto unpack_error; } /* free any left-over detail data */ xfree(job_ptr->details->acctg_freq); for (i=0; idetails->argc; i++) xfree(job_ptr->details->argv[i]); xfree(job_ptr->details->argv); xfree(job_ptr->details->cpu_bind); FREE_NULL_LIST(job_ptr->details->depend_list); xfree(job_ptr->details->dependency); xfree(job_ptr->details->orig_dependency); xfree(job_ptr->details->std_err); for (i=0; idetails->env_cnt; i++) xfree(job_ptr->details->env_sup[i]); xfree(job_ptr->details->env_sup); xfree(job_ptr->details->exc_nodes); xfree(job_ptr->details->features); xfree(job_ptr->details->cluster_features); xfree(job_ptr->details->std_in); xfree(job_ptr->details->mem_bind); xfree(job_ptr->details->std_out); xfree(job_ptr->details->req_nodes); xfree(job_ptr->details->work_dir); /* now put the details into the job record */ job_ptr->details->acctg_freq = acctg_freq; job_ptr->details->argc = argc; job_ptr->details->argv = argv; job_ptr->details->accrue_time = accrue_time; job_ptr->details->begin_time = begin_time; job_ptr->details->contiguous = contiguous; job_ptr->details->core_spec = core_spec; job_ptr->details->cpu_bind = cpu_bind; job_ptr->details->cpu_bind_type = cpu_bind_type; job_ptr->details->cpu_freq_min = cpu_freq_min; job_ptr->details->cpu_freq_max = cpu_freq_max; job_ptr->details->cpu_freq_gov = cpu_freq_gov; job_ptr->details->cpus_per_task = cpus_per_task; job_ptr->details->orig_cpus_per_task = cpus_per_task; job_ptr->details->depend_list = depend_list; job_ptr->details->dependency = dependency; job_ptr->details->orig_dependency = orig_dependency; job_ptr->details->env_cnt = env_cnt; job_ptr->details->env_sup = env_sup; job_ptr->details->std_err = err; job_ptr->details->exc_nodes = exc_nodes; job_ptr->details->features = features; job_ptr->details->cluster_features = cluster_features; job_ptr->details->std_in = in; job_ptr->details->pn_min_cpus = pn_min_cpus; job_ptr->details->orig_pn_min_cpus = pn_min_cpus; job_ptr->details->pn_min_memory = pn_min_memory; job_ptr->details->orig_pn_min_memory = pn_min_memory; job_ptr->details->pn_min_tmp_disk = pn_min_tmp_disk; job_ptr->details->max_cpus = max_cpus; job_ptr->details->orig_max_cpus = max_cpus; job_ptr->details->max_nodes = max_nodes; job_ptr->details->mc_ptr = mc_ptr; job_ptr->details->mem_bind = mem_bind; job_ptr->details->mem_bind_type = mem_bind_type; job_ptr->details->min_cpus = min_cpus; job_ptr->details->orig_min_cpus = min_cpus; job_ptr->details->min_nodes = min_nodes; job_ptr->details->nice = nice; job_ptr->details->ntasks_per_node = ntasks_per_node; job_ptr->details->num_tasks = num_tasks; job_ptr->details->open_mode = open_mode; job_ptr->details->std_out = out; job_ptr->details->overcommit = overcommit; job_ptr->details->plane_size = plane_size; job_ptr->details->prolog_running = prolog_running; job_ptr->details->req_nodes = req_nodes; job_ptr->details->requeue = requeue; job_ptr->details->share_res = share_res; job_ptr->details->submit_time = submit_time; job_ptr->details->task_dist = task_dist; job_ptr->details->whole_node = whole_node; job_ptr->details->work_dir = work_dir; return SLURM_SUCCESS; unpack_error: /* for (i=0; ijob_id); job_ptr->job_next = job_hash[inx]; job_hash[inx] = job_ptr; } /* _remove_job_hash - remove a job hash entry for given job record, job_id must * already be set * IN job_ptr - pointer to job record * IN type - which hash to work with * Globals: hash table updated */ static void _remove_job_hash(job_record_t *job_entry, job_hash_type_t type) { job_record_t *job_ptr, **job_pptr; xassert(job_entry); switch (type) { case JOB_HASH_JOB: job_pptr = &job_hash[JOB_HASH_INX(job_entry->job_id)]; break; case JOB_HASH_ARRAY_JOB: job_pptr = &job_array_hash_j[ JOB_HASH_INX(job_entry->array_job_id)]; break; case JOB_HASH_ARRAY_TASK: job_pptr = &job_array_hash_t[ JOB_ARRAY_HASH_INX(job_entry->array_job_id, job_entry->array_task_id)]; break; default: fatal("%s: unknown job_hash_type_t %d", __func__, type); return; } while ((job_pptr != NULL) && (*job_pptr != NULL) && ((job_ptr = *job_pptr) != job_entry)) { xassert(job_ptr->magic == JOB_MAGIC); switch (type) { case JOB_HASH_JOB: job_pptr = &job_ptr->job_next; break; case JOB_HASH_ARRAY_JOB: job_pptr = &job_ptr->job_array_next_j; break; case JOB_HASH_ARRAY_TASK: job_pptr = &job_ptr->job_array_next_t; break; } } if (job_pptr == NULL || *job_pptr == NULL) { if (job_entry->job_id == NO_VAL) return; switch (type) { case JOB_HASH_JOB: error("%s: Could not find hash entry for JobId=%u", __func__, job_entry->job_id); break; case JOB_HASH_ARRAY_JOB: error("%s: job array hash error %u", __func__, job_entry->array_job_id); break; case JOB_HASH_ARRAY_TASK: error("%s: job array, task ID hash error %u_%u", __func__, job_entry->array_job_id, job_entry->array_task_id); break; } return; } switch (type) { case JOB_HASH_JOB: *job_pptr = job_entry->job_next; job_entry->job_next = NULL; break; case JOB_HASH_ARRAY_JOB: *job_pptr = job_entry->job_array_next_j; job_entry->job_array_next_j = NULL; break; case JOB_HASH_ARRAY_TASK: *job_pptr = job_entry->job_array_next_t; job_entry->job_array_next_t = NULL; break; } } /* _add_job_array_hash - add a job hash entry for given job record, * array_job_id and array_task_id must already be set * IN job_ptr - pointer to job record * Globals: hash table updated */ void _add_job_array_hash(job_record_t *job_ptr) { int inx; if (job_ptr->array_task_id == NO_VAL) return; /* Not a job array */ inx = JOB_HASH_INX(job_ptr->array_job_id); job_ptr->job_array_next_j = job_array_hash_j[inx]; job_array_hash_j[inx] = job_ptr; inx = JOB_ARRAY_HASH_INX(job_ptr->array_job_id,job_ptr->array_task_id); job_ptr->job_array_next_t = job_array_hash_t[inx]; job_array_hash_t[inx] = job_ptr; } /* For the job array data structure, build the string representation of the * bitmap. * NOTE: bit_fmt_hexmask() is far more scalable than bit_fmt(). */ extern void build_array_str(job_record_t *job_ptr) { job_array_struct_t *array_recs = job_ptr->array_recs; if (!array_recs || array_recs->task_id_str || !array_recs->task_id_bitmap || (job_ptr->array_task_id != NO_VAL) || (bit_ffs(job_ptr->array_recs->task_id_bitmap) == -1)) return; array_recs->task_id_str = bit_fmt_hexmask(array_recs->task_id_bitmap); /* While it is efficient to set the db_index to 0 here * to get the database to update the record for * pending tasks it also creates a window in which if * the association id is changed (different account or * partition) instead of returning the previous * db_index (expected) it would create a new one * leaving the other orphaned. Setting the job_state * sets things up so the db_index isn't lost but the * start message is still sent to get the desired behavior. */ /* Here we set the JOB_UPDATE_DB flag so we resend the start of the * job updating the array task string and count of pending * jobs. This is faster than sending the start again since * this could happen many times (like lots of array elements * starting at once) instead of just ever so often. */ if (job_ptr->db_index) job_ptr->job_state |= JOB_UPDATE_DB; } /* Return true if ALL tasks of specific array job ID are complete */ extern bool test_job_array_complete(uint32_t array_job_id) { job_record_t *job_ptr; int inx; job_ptr = find_job_record(array_job_id); if (job_ptr) { if (!IS_JOB_COMPLETE(job_ptr)) return false; if (job_ptr->array_recs && job_ptr->array_recs->max_exit_code) return false; } /* Need to test individual job array records */ inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if (job_ptr->array_job_id == array_job_id) { if (!IS_JOB_COMPLETE(job_ptr)) return false; } job_ptr = job_ptr->job_array_next_j; } return true; } /* Return true if ALL tasks of specific array job ID are completed */ extern bool test_job_array_completed(uint32_t array_job_id) { job_record_t *job_ptr; int inx; job_ptr = find_job_record(array_job_id); if (job_ptr) { if (!IS_JOB_COMPLETED(job_ptr)) return false; } /* Need to test individual job array records */ inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if (job_ptr->array_job_id == array_job_id) { if (!IS_JOB_COMPLETED(job_ptr)) return false; } job_ptr = job_ptr->job_array_next_j; } return true; } /* * Return true if ALL tasks of specific array job ID are completed AND * all except for the head job have been purged. */ extern bool _test_job_array_purged(uint32_t array_job_id) { job_record_t *job_ptr, *head_job_ptr; int inx; head_job_ptr = find_job_record(array_job_id); if (head_job_ptr) { if (!IS_JOB_COMPLETED(head_job_ptr)) return false; } /* Need to test individual job array records */ inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if ((job_ptr->array_job_id == array_job_id) && (job_ptr != head_job_ptr)) { return false; } job_ptr = job_ptr->job_array_next_j; } return true; } /* Return true if ALL tasks of specific array job ID are finished */ extern bool test_job_array_finished(uint32_t array_job_id) { job_record_t *job_ptr; int inx; job_ptr = find_job_record(array_job_id); if (job_ptr) { if (!IS_JOB_FINISHED(job_ptr)) return false; } /* Need to test individual job array records */ inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if (job_ptr->array_job_id == array_job_id) { if (!IS_JOB_FINISHED(job_ptr)) return false; } job_ptr = job_ptr->job_array_next_j; } return true; } /* Return true if ANY tasks of specific array job ID are pending */ extern bool test_job_array_pending(uint32_t array_job_id) { job_record_t *job_ptr; int inx; job_ptr = find_job_record(array_job_id); if (job_ptr) { if (IS_JOB_PENDING(job_ptr)) return true; if (job_ptr->array_recs && job_ptr->array_recs->task_cnt) return true; } /* Need to test individual job array records */ inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if (job_ptr->array_job_id == array_job_id) { if (IS_JOB_PENDING(job_ptr)) return true; } job_ptr = job_ptr->job_array_next_j; } return false; } /* For a given job ID return the number of PENDING tasks which have their * own separate job_record (do not count tasks in pending META job record) */ extern int num_pending_job_array_tasks(uint32_t array_job_id) { job_record_t *job_ptr; int count = 0, inx; inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if ((job_ptr->array_job_id == array_job_id) && IS_JOB_PENDING(job_ptr)) count++; job_ptr = job_ptr->job_array_next_j; } return count; } /* * find_job_array_rec - return a pointer to the job record with the given * array_job_id/array_task_id * IN job_id - requested job's id * IN array_task_id - requested job's task id, * NO_VAL if none specified (i.e. not a job array) * INFINITE return any task for specified job id * RET pointer to the job's record, NULL on error */ extern job_record_t *find_job_array_rec(uint32_t array_job_id, uint32_t array_task_id) { job_record_t *job_ptr, *match_job_ptr = NULL; int inx; if (array_task_id == NO_VAL) return find_job_record(array_job_id); if (array_task_id == INFINITE) { /* find by job ID */ /* Look for job record with all of the pending tasks */ job_ptr = find_job_record(array_job_id); if (job_ptr && job_ptr->array_recs && (job_ptr->array_job_id == array_job_id)) return job_ptr; inx = JOB_HASH_INX(array_job_id); job_ptr = job_array_hash_j[inx]; while (job_ptr) { if (job_ptr->array_job_id == array_job_id) { match_job_ptr = job_ptr; if (!IS_JOB_FINISHED(job_ptr)) { return job_ptr; } } job_ptr = job_ptr->job_array_next_j; } return match_job_ptr; } else { /* Find specific task ID */ inx = JOB_ARRAY_HASH_INX(array_job_id, array_task_id); job_ptr = job_array_hash_t[inx]; while (job_ptr) { if ((job_ptr->array_job_id == array_job_id) && (job_ptr->array_task_id == array_task_id)) { return job_ptr; } job_ptr = job_ptr->job_array_next_t; } /* Look for job record with all of the pending tasks */ job_ptr = find_job_record(array_job_id); if (job_ptr && job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { inx = bit_size(job_ptr->array_recs->task_id_bitmap); if ((array_task_id < inx) && bit_test(job_ptr->array_recs->task_id_bitmap, array_task_id)) { return job_ptr; } } return NULL; /* None found */ } } /* * find_het_job_record - return a pointer to the job record with the given ID * IN job_id - requested job's ID * in het_job_id - hetjob component ID * RET pointer to the job's record, NULL on error */ extern job_record_t *find_het_job_record(uint32_t job_id, uint32_t het_job_id) { job_record_t *het_job_leader, *het_job; ListIterator iter; het_job_leader = job_hash[JOB_HASH_INX(job_id)]; while (het_job_leader) { if (het_job_leader->job_id == job_id) break; het_job_leader = het_job_leader->job_next; } if (!het_job_leader) return NULL; if (het_job_leader->het_job_offset == het_job_id) return het_job_leader; if (!het_job_leader->het_job_list) return NULL; iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { if (het_job_leader->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, het_job_leader); continue; } if (het_job->het_job_offset == het_job_id) break; } list_iterator_destroy(iter); return het_job; } /* * find_job_record - return a pointer to the job record with the given job_id * IN job_id - requested job's id * RET pointer to the job's record, NULL on error */ extern job_record_t *find_job_record(uint32_t job_id) { job_record_t *job_ptr; job_ptr = job_hash[JOB_HASH_INX(job_id)]; while (job_ptr) { if (job_ptr->job_id == job_id) return job_ptr; job_ptr = job_ptr->job_next; } return NULL; } /* rebuild a job's partition name list based upon the contents of its * part_ptr_list */ static void _rebuild_part_name_list(job_record_t *job_ptr) { bool job_active = false, job_pending = false; part_record_t *part_ptr; ListIterator part_iterator; xfree(job_ptr->partition); if (!job_ptr->part_ptr_list) { job_ptr->partition = xstrdup(job_ptr->part_ptr->name); last_job_update = time(NULL); return; } if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { job_active = true; job_ptr->partition = xstrdup(job_ptr->part_ptr->name); } else if (IS_JOB_PENDING(job_ptr)) job_pending = true; part_iterator = list_iterator_create(job_ptr->part_ptr_list); while ((part_ptr = list_next(part_iterator))) { if (job_pending) { /* Reset job's one partition to a valid one */ job_ptr->part_ptr = part_ptr; job_pending = false; } if (job_active && (part_ptr == job_ptr->part_ptr)) continue; /* already added */ if (job_ptr->partition) xstrcat(job_ptr->partition, ","); xstrcat(job_ptr->partition, part_ptr->name); } list_iterator_destroy(part_iterator); last_job_update = time(NULL); } /* * Kill job or job step * * IN job_step_kill_msg - msg with specs on which job/step to cancel. * IN uid - uid of user requesting job/step cancel. */ static int _kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid) { DEF_TIMERS; /* Locks: Read config, write job, write node, read fed */ slurmctld_lock_t job_write_lock = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, NO_LOCK, READ_LOCK }; job_record_t *job_ptr; int error_code = SLURM_SUCCESS; START_TIMER; lock_slurmctld(job_write_lock); job_ptr = find_job_record(job_step_kill_msg->job_id); trace_job(job_ptr, __func__, "enter"); /* do RPC call */ if (job_step_kill_msg->job_step_id == SLURM_BATCH_SCRIPT) { /* NOTE: SLURM_BATCH_SCRIPT == NO_VAL */ error_code = job_signal_id(job_step_kill_msg->job_id, job_step_kill_msg->signal, job_step_kill_msg->flags, uid, false); unlock_slurmctld(job_write_lock); END_TIMER2(__func__); /* return result */ if (error_code) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) info("Signal %u %pJ by UID=%u: %s", job_step_kill_msg->signal, job_ptr, uid, slurm_strerror(error_code)); } else { if (job_step_kill_msg->signal == SIGKILL) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) info("%s: Cancel of %pJ by UID=%u, %s", __func__, job_ptr, uid, TIME_STR); slurmctld_diag_stats.jobs_canceled++; } else { if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) info("%s: Signal %u of %pJ by UID=%u, %s", __func__, job_step_kill_msg->signal, job_ptr, uid, TIME_STR); } /* Below function provides its own locking */ schedule_job_save(); } } else { error_code = job_step_signal(job_step_kill_msg->job_id, job_step_kill_msg->job_step_id, job_step_kill_msg->signal, job_step_kill_msg->flags, uid); unlock_slurmctld(job_write_lock); END_TIMER2(__func__); /* return result */ if (error_code) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) info("Signal %u of JobId=%u StepId=%u by UID=%u: %s", job_step_kill_msg->signal, job_step_kill_msg->job_id, job_step_kill_msg->job_step_id, uid, slurm_strerror(error_code)); } else { if (job_step_kill_msg->signal == SIGKILL) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) info("%s: Cancel of JobId=%u StepId=%u by UID=%u %s", __func__, job_step_kill_msg->job_id, job_step_kill_msg->job_step_id, uid, TIME_STR); } else { if (slurmctld_conf.debug_flags & DEBUG_FLAG_STEPS) info("%s: Signal %u of JobId=%u StepId=%u by UID=%u %s", __func__, job_step_kill_msg->signal, job_step_kill_msg->job_id, job_step_kill_msg->job_step_id, uid, TIME_STR); } /* Below function provides its own locking */ schedule_job_save(); } } trace_job(job_ptr, __func__, "return"); return error_code; } /* * Kill job or job step * * IN job_step_kill_msg - msg with specs on which job/step to cancel. * IN uid - uid of user requesting job/step cancel. */ extern int kill_job_step(job_step_kill_msg_t *job_step_kill_msg, uint32_t uid) { /* Locks: Read job */ slurmctld_lock_t job_read_lock = { NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; job_record_t *job_ptr, *het_job_ptr; uint32_t *het_job_ids = NULL; int cnt = 0, i, rc; int error_code = SLURM_SUCCESS; ListIterator iter; lock_slurmctld(job_read_lock); job_ptr = find_job_record(job_step_kill_msg->job_id); if (job_ptr && job_ptr->het_job_list && (job_step_kill_msg->signal == SIGKILL) && (job_step_kill_msg->job_step_id != SLURM_BATCH_SCRIPT)) { cnt = list_count(job_ptr->het_job_list); het_job_ids = xcalloc(cnt, sizeof(uint32_t)); i = 0; iter = list_iterator_create(job_ptr->het_job_list); while ((het_job_ptr = list_next(iter))) { het_job_ids[i++] = het_job_ptr->job_id; } list_iterator_destroy(iter); } unlock_slurmctld(job_read_lock); if (!job_ptr) { info("%s: invalid JobId=%u", __func__, job_step_kill_msg->job_id); error_code = ESLURM_INVALID_JOB_ID; } else if (het_job_ids) { for (i = 0; i < cnt; i++) { job_step_kill_msg->job_id = het_job_ids[i]; rc = _kill_job_step(job_step_kill_msg, uid); if (rc != SLURM_SUCCESS) error_code = rc; } xfree(het_job_ids); } else { error_code = _kill_job_step(job_step_kill_msg, uid); } return error_code; } /* * kill_job_by_part_name - Given a partition name, deallocate resource for * its jobs and kill them. All jobs associated with this partition * will have their partition pointer cleared. * IN part_name - name of a partition * RET number of jobs associated with this partition */ extern int kill_job_by_part_name(char *part_name) { ListIterator job_iterator, part_iterator; job_record_t *job_ptr; part_record_t *part_ptr, *part2_ptr; int kill_job_cnt = 0; time_t now = time(NULL); part_ptr = find_part_record (part_name); if (part_ptr == NULL) /* No such partition */ return 0; job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { bool pending = false, suspended = false; pending = IS_JOB_PENDING(job_ptr); if (job_ptr->part_ptr_list) { /* Remove partition if candidate for a job */ bool rebuild_name_list = false; part_iterator = list_iterator_create(job_ptr-> part_ptr_list); while ((part2_ptr = list_next(part_iterator))) { if (part2_ptr != part_ptr) continue; list_remove(part_iterator); rebuild_name_list = true; } list_iterator_destroy(part_iterator); if (rebuild_name_list) { if (list_count(job_ptr->part_ptr_list) > 0) { _rebuild_part_name_list(job_ptr); job_ptr->part_ptr = list_peek(job_ptr-> part_ptr_list); } else { FREE_NULL_LIST(job_ptr->part_ptr_list); } } } if (job_ptr->part_ptr != part_ptr) continue; if (IS_JOB_SUSPENDED(job_ptr)) { uint32_t suspend_job_state = job_ptr->job_state; /* we can't have it as suspended when we call the * accounting stuff. */ job_ptr->job_state = JOB_CANCELLED; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; suspended = true; } if (IS_JOB_RUNNING(job_ptr) || suspended) { kill_job_cnt++; info("Killing %pJ on defunct partition %s", job_ptr, part_name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; build_cg_bitmap(job_ptr); job_ptr->state_reason = FAIL_DOWN_PARTITION; xfree(job_ptr->state_desc); if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = now; job_completion_logger(job_ptr, false); if (!pending) deallocate_nodes(job_ptr, false, suspended, false); } else if (pending) { kill_job_cnt++; info("Killing %pJ on defunct partition %s", job_ptr, part_name); job_ptr->job_state = JOB_CANCELLED; job_ptr->start_time = now; job_ptr->end_time = now; job_ptr->exit_code = 1; job_completion_logger(job_ptr, false); fed_mgr_job_complete(job_ptr, 0, now); } job_ptr->part_ptr = NULL; FREE_NULL_LIST(job_ptr->part_ptr_list); } list_iterator_destroy(job_iterator); if (kill_job_cnt) last_job_update = now; return kill_job_cnt; } /* * kill_job_by_front_end_name - Given a front end node name, deallocate * resource for its jobs and kill them. * IN node_name - name of a front end node * RET number of jobs associated with this front end node * NOTE: Patterned after kill_running_job_by_node_name() */ extern int kill_job_by_front_end_name(char *node_name) { #ifdef HAVE_FRONT_END ListIterator job_iterator; job_record_t *job_ptr, *het_job_leader; node_record_t *node_ptr; time_t now = time(NULL); int i, kill_job_cnt = 0; if (node_name == NULL) fatal("kill_job_by_front_end_name: node_name is NULL"); job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { bool suspended = false; if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr) && !IS_JOB_COMPLETING(job_ptr)) continue; het_job_leader = NULL; if (job_ptr->het_job_id) het_job_leader = find_job_record(job_ptr->het_job_id); if (!het_job_leader) het_job_leader = job_ptr; if ((het_job_leader->batch_host == NULL) || xstrcmp(het_job_leader->batch_host, node_name)) continue; /* no match on node name */ if (IS_JOB_SUSPENDED(job_ptr)) { uint32_t suspend_job_state = job_ptr->job_state; /* * we can't have it as suspended when we call the * accounting stuff. */ job_ptr->job_state = JOB_CANCELLED; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; suspended = true; } if (IS_JOB_COMPLETING(job_ptr)) { kill_job_cnt++; while ((i = bit_ffs(job_ptr->node_bitmap_cg)) >= 0) { bit_clear(job_ptr->node_bitmap_cg, i); if (job_ptr->node_cnt) (job_ptr->node_cnt)--; else { error("node_cnt underflow on %pJ", job_ptr); } job_update_tres_cnt(job_ptr, i); if (job_ptr->node_cnt == 0) { cleanup_completing(job_ptr); } node_ptr = &node_record_table_ptr[i]; if (node_ptr->comp_job_cnt) (node_ptr->comp_job_cnt)--; else { error("Node %s comp_job_cnt underflow, %pJ", node_ptr->name, job_ptr); } } } else if (IS_JOB_RUNNING(job_ptr) || suspended) { kill_job_cnt++; if (job_ptr->batch_flag && job_ptr->details && slurmctld_conf.job_requeue && (job_ptr->details->requeue > 0)) { char requeue_msg[128]; srun_node_fail(job_ptr, node_name); info("requeue %pJ due to failure of node %s", job_ptr, node_name); set_job_prio(job_ptr); snprintf(requeue_msg, sizeof(requeue_msg), "Job requeued due to failure " "of node %s", node_name); job_ptr->time_last_active = now; if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr-> suspend_time); } else job_ptr->end_time = now; /* * We want this job to look like it * was terminated in the accounting logs. * Set a new submit time so the restarted * job looks like a new job. */ job_ptr->job_state = JOB_NODE_FAIL; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, true); deallocate_nodes(job_ptr, false, suspended, false); /* do this after the epilog complete, * setting it here is too early */ //job_ptr->db_index = 0; //job_ptr->details->submit_time = now; job_ptr->job_state = JOB_PENDING; if (job_ptr->node_cnt) job_ptr->job_state |= JOB_COMPLETING; job_ptr->restart_cnt++; /* clear signal sent flag on requeue */ job_ptr->warn_flags &= ~WARN_SENT; /* Since the job completion logger * removes the submit we need to add it * again. */ acct_policy_add_job_submit(job_ptr); if (!job_ptr->node_bitmap_cg || bit_set_count(job_ptr->node_bitmap_cg) == 0) batch_requeue_fini(job_ptr); } else { info("Killing %pJ on failed node %s", job_ptr, node_name); srun_node_fail(job_ptr, node_name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; build_cg_bitmap(job_ptr); job_ptr->state_reason = FAIL_DOWN_NODE; xfree(job_ptr->state_desc); if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = now; job_completion_logger(job_ptr, false); deallocate_nodes(job_ptr, false, suspended, false); } } } list_iterator_destroy(job_iterator); if (kill_job_cnt) last_job_update = now; return kill_job_cnt; #else return 0; #endif } /* * partition_in_use - determine whether a partition is in use by a RUNNING * PENDING or SUSPENDED job or reservations * IN part_name - name of a partition * RET true if the partition is in use, else false */ extern bool partition_in_use(char *part_name) { ListIterator job_iterator; job_record_t *job_ptr; part_record_t *part_ptr; part_ptr = find_part_record (part_name); if (part_ptr == NULL) /* No such partition */ return false; /* check jobs */ job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if (job_ptr->part_ptr == part_ptr) { if (!IS_JOB_FINISHED(job_ptr)) { list_iterator_destroy(job_iterator); return true; } } } list_iterator_destroy(job_iterator); /* check reservations */ if (list_find_first(resv_list, _find_resv_part, part_ptr)) return true; return false; } /* Clear a job's GRES details per node strings, rebuilt later on demand */ static void _clear_job_gres_details(job_record_t *job_ptr) { int i; xfree(job_ptr->gres_used); for (i = 0; i < job_ptr->gres_detail_cnt; i++) xfree(job_ptr->gres_detail_str[i]); xfree(job_ptr->gres_detail_str); job_ptr->gres_detail_cnt = 0; } static bool _job_node_test(job_record_t *job_ptr, int node_inx) { if (job_ptr->node_bitmap && bit_test(job_ptr->node_bitmap, node_inx)) return true; return false; } static bool _het_job_on_node(job_record_t *job_ptr, int node_inx) { job_record_t *het_job_leader, *het_job; ListIterator iter; static bool result = false; if (!job_ptr->het_job_id) return _job_node_test(job_ptr, node_inx); het_job_leader = find_job_record(job_ptr->het_job_id); if (!het_job_leader) { error("%s: Hetjob leader %pJ not found", __func__, job_ptr); return _job_node_test(job_ptr, node_inx); } if (!het_job_leader->het_job_list) { error("%s: Hetjob leader %pJ job list is NULL", __func__, job_ptr); return _job_node_test(job_ptr, node_inx); } iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { if ((result = _job_node_test(het_job, node_inx))) break; /* * After a DOWN node is removed from another job component, * we have no way to identify other hetjob components with * the same node, so assume if one component is in NODE_FAILED * state, they all should be. */ if (IS_JOB_NODE_FAILED(het_job)) { result = true; break; } } list_iterator_destroy(iter); return result; } /* * kill_running_job_by_node_name - Given a node name, deallocate RUNNING * or COMPLETING jobs from the node or kill them * IN node_name - name of a node * RET number of killed jobs */ extern int kill_running_job_by_node_name(char *node_name) { ListIterator job_iterator; job_record_t *job_ptr; node_record_t *node_ptr; int node_inx; int kill_job_cnt = 0; time_t now = time(NULL); xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); node_ptr = find_node_record(node_name); if (node_ptr == NULL) /* No such node */ return 0; node_inx = node_ptr - node_record_table_ptr; job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { bool suspended = false; if (!_het_job_on_node(job_ptr, node_inx)) continue; /* job not on this node */ if (nonstop_ops.node_fail) (nonstop_ops.node_fail)(job_ptr, node_ptr); if (IS_JOB_SUSPENDED(job_ptr)) { uint32_t suspend_job_state = job_ptr->job_state; /* * we can't have it as suspended when we call the * accounting stuff. */ job_ptr->job_state = JOB_CANCELLED; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; suspended = true; } if (IS_JOB_COMPLETING(job_ptr)) { if (!bit_test(job_ptr->node_bitmap_cg, node_inx)) continue; kill_job_cnt++; bit_clear(job_ptr->node_bitmap_cg, node_inx); job_update_tres_cnt(job_ptr, node_inx); if (job_ptr->node_cnt) (job_ptr->node_cnt)--; else { error("node_cnt underflow on %pJ", job_ptr); } if (job_ptr->node_cnt == 0) cleanup_completing(job_ptr); if (node_ptr->comp_job_cnt) (node_ptr->comp_job_cnt)--; else { error("Node %s comp_job_cnt underflow, %pJ", node_ptr->name, job_ptr); } } else if (IS_JOB_RUNNING(job_ptr) || suspended) { kill_job_cnt++; if ((job_ptr->details) && (job_ptr->kill_on_node_fail == 0) && (job_ptr->node_cnt > 1) && !IS_JOB_CONFIGURING(job_ptr)) { /* keep job running on remaining nodes */ srun_node_fail(job_ptr, node_name); error("Removing failed node %s from %pJ", node_name, job_ptr); job_pre_resize_acctg(job_ptr); kill_step_on_node(job_ptr, node_ptr, true); excise_node_from_job(job_ptr, node_ptr); (void) gs_job_start(job_ptr); gres_build_job_details(job_ptr->gres_list, &job_ptr->gres_detail_cnt, &job_ptr->gres_detail_str, &job_ptr->gres_used); job_post_resize_acctg(job_ptr); } else if (job_ptr->batch_flag && job_ptr->details && job_ptr->details->requeue) { char requeue_msg[128]; srun_node_fail(job_ptr, node_name); info("requeue job %pJ due to failure of node %s", job_ptr, node_name); snprintf(requeue_msg, sizeof(requeue_msg), "Job requeued due to failure " "of node %s", node_name); job_ptr->time_last_active = now; if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr-> suspend_time); } else job_ptr->end_time = now; /* * We want this job to look like it * was terminated in the accounting logs. * Set a new submit time so the restarted * job looks like a new job. */ job_ptr->job_state = JOB_NODE_FAIL; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, true); deallocate_nodes(job_ptr, false, suspended, false); /* do this after the epilog complete, * setting it here is too early */ //job_ptr->db_index = 0; //job_ptr->details->submit_time = now; job_ptr->job_state = JOB_PENDING; if (job_ptr->node_cnt) job_ptr->job_state |= JOB_COMPLETING; job_ptr->restart_cnt++; /* clear signal sent flag on requeue */ job_ptr->warn_flags &= ~WARN_SENT; /* * Since the job completion logger * removes the submit we need to add it * again. */ acct_policy_add_job_submit(job_ptr); if (!job_ptr->node_bitmap_cg || bit_set_count(job_ptr->node_bitmap_cg) == 0) batch_requeue_fini(job_ptr); } else { info("Killing %pJ on failed node %s", job_ptr, node_name); srun_node_fail(job_ptr, node_name); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; build_cg_bitmap(job_ptr); job_ptr->state_reason = FAIL_DOWN_NODE; xfree(job_ptr->state_desc); if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = now; job_completion_logger(job_ptr, false); deallocate_nodes(job_ptr, false, suspended, false); } } } list_iterator_destroy(job_iterator); if (kill_job_cnt) last_job_update = now; return kill_job_cnt; } /* Remove one node from a job's allocation */ extern void excise_node_from_job(job_record_t *job_ptr, node_record_t *node_ptr) { int i, i_first, i_last, orig_pos = -1, new_pos = -1; bitstr_t *orig_bitmap; orig_bitmap = bit_copy(job_ptr->node_bitmap); make_node_idle(node_ptr, job_ptr); /* updates bitmap */ xfree(job_ptr->nodes); job_ptr->nodes = bitmap2node_name(job_ptr->node_bitmap); i_first = bit_ffs(orig_bitmap); if (i_first >= 0) i_last = bit_fls(orig_bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(orig_bitmap,i)) continue; orig_pos++; if (!bit_test(job_ptr->node_bitmap, i)) continue; new_pos++; if (orig_pos == new_pos) continue; memcpy(&job_ptr->node_addr[new_pos], &job_ptr->node_addr[orig_pos], sizeof(slurm_addr_t)); /* * NOTE: The job's allocation in the job_ptr->job_resrcs * data structure is unchanged even after a node allocated * to the job goes DOWN. */ } job_ptr->total_nodes = job_ptr->node_cnt = new_pos + 1; FREE_NULL_BITMAP(orig_bitmap); (void) select_g_job_resized(job_ptr, node_ptr); } /* * dump_job_desc - dump the incoming job submit request message * IN job_specs - job specification from RPC */ void dump_job_desc(job_desc_msg_t * job_specs) { long pn_min_cpus, pn_min_tmp_disk, min_cpus; uint64_t pn_min_memory; long time_limit, priority, contiguous, nice, time_min; long kill_on_node_fail, shared, immediate, wait_all_nodes; long cpus_per_task, requeue, num_tasks, overcommit; long ntasks_per_node, ntasks_per_socket, ntasks_per_core; int spec_count; char *mem_type, buf[100], *signal_flags, *spec_type, *job_id; if (get_log_level() < LOG_LEVEL_DEBUG3) return; if (job_specs == NULL) return; if (job_specs->job_id_str) job_id = job_specs->job_id_str; else if (job_specs->job_id == NO_VAL) job_id = "N/A"; else { snprintf(buf, sizeof(buf), "%u", job_specs->job_id); job_id = buf; } debug3("JobDesc: user_id=%u JobId=%s partition=%s name=%s", job_specs->user_id, job_id, job_specs->partition, job_specs->name); min_cpus = (job_specs->min_cpus != NO_VAL) ? (long) job_specs->min_cpus : -1L; pn_min_cpus = (job_specs->pn_min_cpus != NO_VAL16) ? (long) job_specs->pn_min_cpus : -1L; if (job_specs->core_spec == NO_VAL16) { spec_type = "core"; spec_count = -1; } else if (job_specs->core_spec & CORE_SPEC_THREAD) { spec_type = "thread"; spec_count = job_specs->core_spec & (~CORE_SPEC_THREAD); } else { spec_type = "core"; spec_count = job_specs->core_spec; } debug3(" cpus=%ld-%u pn_min_cpus=%ld %s_spec=%d", min_cpus, job_specs->max_cpus, pn_min_cpus, spec_type, spec_count); debug3(" Nodes=%u-[%u] Sock/Node=%u Core/Sock=%u Thread/Core=%u", job_specs->min_nodes, job_specs->max_nodes, job_specs->sockets_per_node, job_specs->cores_per_socket, job_specs->threads_per_core); if (job_specs->pn_min_memory == NO_VAL64) { pn_min_memory = -1L; mem_type = "job"; } else if (job_specs->pn_min_memory & MEM_PER_CPU) { pn_min_memory = job_specs->pn_min_memory & (~MEM_PER_CPU); mem_type = "cpu"; } else { pn_min_memory = job_specs->pn_min_memory; mem_type = "job"; } pn_min_tmp_disk = (job_specs->pn_min_tmp_disk != NO_VAL) ? (long) job_specs->pn_min_tmp_disk : -1L; debug3(" pn_min_memory_%s=%"PRIu64" pn_min_tmp_disk=%ld", mem_type, pn_min_memory, pn_min_tmp_disk); immediate = (job_specs->immediate == 0) ? 0L : 1L; debug3(" immediate=%ld reservation=%s", immediate, job_specs->reservation); debug3(" features=%s batch_features=%s cluster_features=%s", job_specs->features, job_specs->batch_features, job_specs->cluster_features); debug3(" req_nodes=%s exc_nodes=%s", job_specs->req_nodes, job_specs->exc_nodes); time_limit = (job_specs->time_limit != NO_VAL) ? (long) job_specs->time_limit : -1L; time_min = (job_specs->time_min != NO_VAL) ? (long) job_specs->time_min : time_limit; priority = (job_specs->priority != NO_VAL) ? (long) job_specs->priority : -1L; contiguous = (job_specs->contiguous != NO_VAL16) ? (long) job_specs->contiguous : -1L; shared = (job_specs->shared != NO_VAL16) ? (long) job_specs->shared : -1L; debug3(" time_limit=%ld-%ld priority=%ld contiguous=%ld shared=%ld", time_min, time_limit, priority, contiguous, shared); kill_on_node_fail = (job_specs->kill_on_node_fail != NO_VAL16) ? (long) job_specs->kill_on_node_fail : -1L; if (job_specs->script) /* log has problem with string len & null */ debug3(" kill_on_node_fail=%ld script=%.40s...", kill_on_node_fail, job_specs->script); else debug3(" kill_on_node_fail=%ld script=(null)", kill_on_node_fail); if (job_specs->argc == 1) debug3(" argv=\"%s\"", job_specs->argv[0]); else if (job_specs->argc == 2) debug3(" argv=%s,%s", job_specs->argv[0], job_specs->argv[1]); else if (job_specs->argc > 2) debug3(" argv=%s,%s,%s,...", job_specs->argv[0], job_specs->argv[1], job_specs->argv[2]); if (job_specs->env_size == 1) debug3(" environment=\"%s\"", job_specs->environment[0]); else if (job_specs->env_size == 2) debug3(" environment=%s,%s", job_specs->environment[0], job_specs->environment[1]); else if (job_specs->env_size > 2) debug3(" environment=%s,%s,%s,...", job_specs->environment[0], job_specs->environment[1], job_specs->environment[2]); if (job_specs->spank_job_env_size == 1) debug3(" spank_job_env=\"%s\"", job_specs->spank_job_env[0]); else if (job_specs->spank_job_env_size == 2) debug3(" spank_job_env=%s,%s", job_specs->spank_job_env[0], job_specs->spank_job_env[1]); else if (job_specs->spank_job_env_size > 2) debug3(" spank_job_env=%s,%s,%s,...", job_specs->spank_job_env[0], job_specs->spank_job_env[1], job_specs->spank_job_env[2]); debug3(" stdin=%s stdout=%s stderr=%s", job_specs->std_in, job_specs->std_out, job_specs->std_err); debug3(" work_dir=%s alloc_node:sid=%s:%u", job_specs->work_dir, job_specs->alloc_node, job_specs->alloc_sid); debug3(" power_flags=%s", power_flags_str(job_specs->power_flags)); debug3(" resp_host=%s alloc_resp_port=%u other_port=%u", job_specs->resp_host, job_specs->alloc_resp_port, job_specs->other_port); debug3(" dependency=%s account=%s qos=%s comment=%s", job_specs->dependency, job_specs->account, job_specs->qos, job_specs->comment); num_tasks = (job_specs->num_tasks != NO_VAL) ? (long) job_specs->num_tasks : -1L; overcommit = (job_specs->overcommit != NO_VAL8) ? (long) job_specs->overcommit : -1L; nice = (job_specs->nice != NO_VAL) ? ((int64_t)job_specs->nice - NICE_OFFSET) : 0; debug3(" mail_type=%u mail_user=%s nice=%ld num_tasks=%ld " "open_mode=%u overcommit=%ld acctg_freq=%s", job_specs->mail_type, job_specs->mail_user, nice, num_tasks, job_specs->open_mode, overcommit, job_specs->acctg_freq); slurm_make_time_str(&job_specs->begin_time, buf, sizeof(buf)); cpus_per_task = (job_specs->cpus_per_task != NO_VAL16) ? (long) job_specs->cpus_per_task : -1L; requeue = (job_specs->requeue != NO_VAL16) ? (long) job_specs->requeue : -1L; debug3(" network=%s begin=%s cpus_per_task=%ld requeue=%ld " "licenses=%s", job_specs->network, buf, cpus_per_task, requeue, job_specs->licenses); slurm_make_time_str(&job_specs->end_time, buf, sizeof(buf)); wait_all_nodes = (job_specs->wait_all_nodes != NO_VAL16) ? (long) job_specs->wait_all_nodes : -1L; if (job_specs->warn_flags & KILL_JOB_BATCH) signal_flags = "B:"; else signal_flags = ""; cpu_freq_debug(NULL, NULL, buf, sizeof(buf), job_specs->cpu_freq_gov, job_specs->cpu_freq_min, job_specs->cpu_freq_max, NO_VAL); debug3(" end_time=%s signal=%s%u@%u wait_all_nodes=%ld cpu_freq=%s", buf, signal_flags, job_specs->warn_signal, job_specs->warn_time, wait_all_nodes, buf); ntasks_per_node = (job_specs->ntasks_per_node != NO_VAL16) ? (long) job_specs->ntasks_per_node : -1L; ntasks_per_socket = (job_specs->ntasks_per_socket != NO_VAL16) ? (long) job_specs->ntasks_per_socket : -1L; ntasks_per_core = (job_specs->ntasks_per_core != NO_VAL16) ? (long) job_specs->ntasks_per_core : -1L; debug3(" ntasks_per_node=%ld ntasks_per_socket=%ld " "ntasks_per_core=%ld", ntasks_per_node, ntasks_per_socket, ntasks_per_core); debug3(" mem_bind=%u:%s plane_size:%u", job_specs->mem_bind_type, job_specs->mem_bind, job_specs->plane_size); debug3(" array_inx=%s", job_specs->array_inx); debug3(" burst_buffer=%s", job_specs->burst_buffer); debug3(" mcs_label=%s", job_specs->mcs_label); slurm_make_time_str(&job_specs->deadline, buf, sizeof(buf)); debug3(" deadline=%s", buf); debug3(" bitflags=%u delay_boot=%u", job_specs->bitflags, job_specs->delay_boot); if (job_specs->cpus_per_tres) debug3(" CPUs_per_TRES=%s", job_specs->cpus_per_tres); if (job_specs->mem_per_tres) debug3(" Mem_per_TRES=%s", job_specs->mem_per_tres); if (job_specs->tres_bind) debug3(" TRES_bind=%s", job_specs->tres_bind); if (job_specs->tres_freq) debug3(" TRES_freq=%s", job_specs->tres_freq); if (job_specs->tres_per_job) debug3(" TRES_per_job=%s", job_specs->tres_per_job); if (job_specs->tres_per_node) debug3(" TRES_per_node=%s", job_specs->tres_per_node); if (job_specs->tres_per_socket) debug3(" TRES_per_socket=%s", job_specs->tres_per_socket); if (job_specs->tres_per_task) debug3(" TRES_per_task=%s", job_specs->tres_per_task); select_g_select_jobinfo_sprint(job_specs->select_jobinfo, buf, sizeof(buf), SELECT_PRINT_MIXED); if (buf[0] != '\0') debug3(" %s", buf); } /* * init_job_conf - initialize the job configuration tables and values. * this should be called after creating node information, but * before creating any job entries. Pre-existing job entries are * left unchanged. * NOTE: The job hash table size does not change after initial creation. * RET 0 if no error, otherwise an error code * global: last_job_update - time of last job table update * job_list - pointer to global job list */ int init_job_conf(void) { if (job_list == NULL) { job_count = 0; job_list = list_create(_list_delete_job); } last_job_update = time(NULL); if (!purge_files_list) { purge_files_list = list_create(xfree_ptr); } return SLURM_SUCCESS; } /* * rehash_jobs - Create or rebuild the job hash table. */ extern void rehash_jobs(void) { xassert(verify_lock(CONF_LOCK, READ_LOCK)); xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); if (job_hash == NULL) { hash_table_size = slurmctld_conf.max_job_cnt; job_hash = xcalloc(hash_table_size, sizeof(job_record_t *)); job_array_hash_j = xcalloc(hash_table_size, sizeof(job_record_t *)); job_array_hash_t = xcalloc(hash_table_size, sizeof(job_record_t *)); } else if (hash_table_size < (slurmctld_conf.max_job_cnt / 2)) { /* If the MaxJobCount grows by too much, the hash table will * be ineffective without rebuilding. We don't presently bother * to rebuild the hash table, but cut MaxJobCount back as * needed. */ error ("MaxJobCount reset too high, restart slurmctld"); slurmctld_conf.max_job_cnt = hash_table_size; } } /* Create an exact copy of an existing job record for a job array. * IN job_ptr - META job record for a job array, which is to become an * individial task of the job array. * Set the job's array_task_id to the task to be split out. * RET - The new job record, which is the new META job record. */ extern job_record_t *job_array_split(job_record_t *job_ptr) { job_record_t *job_ptr_pend = NULL, *save_job_next; struct job_details *job_details, *details_new, *save_details; uint32_t save_job_id; uint64_t save_db_index = job_ptr->db_index; priority_factors_object_t *save_prio_factors; List save_step_list; int i; job_ptr_pend = _create_job_record(0); if (!job_ptr_pend) return NULL; _remove_job_hash(job_ptr, JOB_HASH_JOB); job_ptr_pend->job_id = job_ptr->job_id; if (_set_job_id(job_ptr) != SLURM_SUCCESS) fatal("%s: _set_job_id error", __func__); if (!job_ptr->array_recs) { fatal_abort("%s: %pJ record lacks array structure", __func__, job_ptr); } /* * Copy most of original job data. * This could be done in parallel, but performance was worse. */ save_job_id = job_ptr_pend->job_id; save_job_next = job_ptr_pend->job_next; save_details = job_ptr_pend->details; save_prio_factors = job_ptr_pend->prio_factors; save_step_list = job_ptr_pend->step_list; memcpy(job_ptr_pend, job_ptr, sizeof(job_record_t)); job_ptr_pend->job_id = save_job_id; job_ptr_pend->job_next = save_job_next; job_ptr_pend->details = save_details; job_ptr_pend->db_flags = 0; job_ptr_pend->step_list = save_step_list; job_ptr_pend->db_index = save_db_index; job_ptr_pend->prio_factors = save_prio_factors; slurm_copy_priority_factors_object(job_ptr_pend->prio_factors, job_ptr->prio_factors); job_ptr_pend->account = xstrdup(job_ptr->account); job_ptr_pend->admin_comment = xstrdup(job_ptr->admin_comment); job_ptr_pend->alias_list = xstrdup(job_ptr->alias_list); job_ptr_pend->alloc_node = xstrdup(job_ptr->alloc_node); job_ptr_pend->array_recs = job_ptr->array_recs; job_ptr->array_recs = NULL; if (job_ptr_pend->array_recs && job_ptr_pend->array_recs->task_id_bitmap) { bit_clear(job_ptr_pend->array_recs->task_id_bitmap, job_ptr_pend->array_task_id); } xfree(job_ptr_pend->array_recs->task_id_str); if (job_ptr_pend->array_recs->task_cnt) { job_ptr_pend->array_recs->task_cnt--; } else { error("%pJ array_recs->task_cnt underflow", job_ptr); } job_ptr_pend->array_task_id = NO_VAL; job_ptr_pend->batch_host = NULL; job_ptr_pend->burst_buffer = xstrdup(job_ptr->burst_buffer); job_ptr_pend->burst_buffer_state = xstrdup(job_ptr->burst_buffer_state); job_ptr_pend->clusters = xstrdup(job_ptr->clusters); job_ptr_pend->comment = xstrdup(job_ptr->comment); job_ptr_pend->fed_details = _dup_job_fed_details(job_ptr->fed_details); job_ptr_pend->front_end_ptr = NULL; /* struct job_details *details; *** NOTE: Copied below */ if (job_ptr->gres_list) { job_ptr_pend->gres_list = gres_plugin_job_state_dup(job_ptr->gres_list); } job_ptr_pend->gres_detail_cnt = 0; job_ptr_pend->gres_detail_str = NULL; job_ptr_pend->gres_alloc = NULL; job_ptr_pend->gres_req = NULL; job_ptr_pend->gres_used = NULL; job_ptr_pend->limit_set.tres = xcalloc(slurmctld_tres_cnt, sizeof(uint16_t)); memcpy(job_ptr_pend->limit_set.tres, job_ptr->limit_set.tres, sizeof(uint16_t) * slurmctld_tres_cnt); _add_job_hash(job_ptr); /* Sets job_next */ _add_job_hash(job_ptr_pend); /* Sets job_next */ _add_job_array_hash(job_ptr); job_ptr_pend->job_resrcs = NULL; job_ptr_pend->licenses = xstrdup(job_ptr->licenses); job_ptr_pend->license_list = license_job_copy(job_ptr->license_list); job_ptr_pend->mail_user = xstrdup(job_ptr->mail_user); job_ptr_pend->mcs_label = xstrdup(job_ptr->mcs_label); job_ptr_pend->name = xstrdup(job_ptr->name); job_ptr_pend->network = xstrdup(job_ptr->network); job_ptr_pend->node_addr = NULL; job_ptr_pend->node_bitmap = NULL; job_ptr_pend->node_bitmap_cg = NULL; job_ptr_pend->nodes = NULL; job_ptr_pend->nodes_completing = NULL; job_ptr_pend->partition = xstrdup(job_ptr->partition); job_ptr_pend->part_ptr_list = part_list_copy(job_ptr->part_ptr_list); /* On jobs that are held the priority_array isn't set up yet, * so check to see if it exists before copying. */ if (job_ptr->part_ptr_list && job_ptr->priority_array) { i = list_count(job_ptr->part_ptr_list) * sizeof(uint32_t); job_ptr_pend->priority_array = xmalloc(i); memcpy(job_ptr_pend->priority_array, job_ptr->priority_array, i); } job_ptr_pend->resv_name = xstrdup(job_ptr->resv_name); job_ptr_pend->resp_host = xstrdup(job_ptr->resp_host); if (job_ptr->select_jobinfo) { job_ptr_pend->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); } job_ptr_pend->sched_nodes = NULL; if (job_ptr->spank_job_env_size) { job_ptr_pend->spank_job_env = xcalloc((job_ptr->spank_job_env_size + 1), sizeof(char *)); for (i = 0; i < job_ptr->spank_job_env_size; i++) { job_ptr_pend->spank_job_env[i] = xstrdup(job_ptr->spank_job_env[i]); } } job_ptr_pend->state_desc = xstrdup(job_ptr->state_desc); job_ptr_pend->system_comment = xstrdup(job_ptr->system_comment); i = sizeof(uint64_t) * slurmctld_tres_cnt; job_ptr_pend->tres_req_cnt = xmalloc(i); memcpy(job_ptr_pend->tres_req_cnt, job_ptr->tres_req_cnt, i); job_ptr_pend->tres_req_str = xstrdup(job_ptr->tres_req_str); job_ptr_pend->tres_fmt_req_str = xstrdup(job_ptr->tres_fmt_req_str); job_ptr_pend->tres_alloc_str = NULL; job_ptr_pend->tres_fmt_alloc_str = NULL; job_ptr_pend->tres_alloc_cnt = NULL; job_ptr_pend->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres); job_ptr_pend->mem_per_tres = xstrdup(job_ptr->mem_per_tres); job_ptr_pend->tres_bind = xstrdup(job_ptr->tres_bind); job_ptr_pend->tres_freq = xstrdup(job_ptr->tres_freq); job_ptr_pend->tres_per_job = xstrdup(job_ptr->tres_per_job); job_ptr_pend->tres_per_node = xstrdup(job_ptr->tres_per_node); job_ptr_pend->tres_per_socket = xstrdup(job_ptr->tres_per_socket); job_ptr_pend->tres_per_task = xstrdup(job_ptr->tres_per_task); job_ptr_pend->user_name = xstrdup(job_ptr->user_name); job_ptr_pend->wckey = xstrdup(job_ptr->wckey); job_ptr_pend->deadline = job_ptr->deadline; job_details = job_ptr->details; details_new = job_ptr_pend->details; memcpy(details_new, job_details, sizeof(struct job_details)); /* * Reset the preempt_start_time or high priority array jobs will hang * for a period before preempting more jobs. */ details_new->preempt_start_time = 0; details_new->acctg_freq = xstrdup(job_details->acctg_freq); if (job_details->argc) { details_new->argv = xcalloc((job_details->argc + 1), sizeof(char *)); for (i = 0; i < job_details->argc; i++) { details_new->argv[i] = xstrdup(job_details->argv[i]); } } details_new->cpu_bind = xstrdup(job_details->cpu_bind); details_new->cpu_bind_type = job_details->cpu_bind_type; details_new->cpu_freq_min = job_details->cpu_freq_min; details_new->cpu_freq_max = job_details->cpu_freq_max; details_new->cpu_freq_gov = job_details->cpu_freq_gov; details_new->depend_list = depended_list_copy(job_details->depend_list); details_new->dependency = xstrdup(job_details->dependency); details_new->orig_dependency = xstrdup(job_details->orig_dependency); if (job_details->env_cnt) { details_new->env_sup = xcalloc((job_details->env_cnt + 1), sizeof(char *)); for (i = 0; i < job_details->env_cnt; i++) { details_new->env_sup[i] = xstrdup(job_details->env_sup[i]); } } if (job_details->exc_node_bitmap) { details_new->exc_node_bitmap = bit_copy(job_details->exc_node_bitmap); } details_new->exc_nodes = xstrdup(job_details->exc_nodes); details_new->feature_list = feature_list_copy(job_details->feature_list); details_new->features = xstrdup(job_details->features); details_new->cluster_features = xstrdup(job_details->cluster_features); if (job_details->mc_ptr) { i = sizeof(multi_core_data_t); details_new->mc_ptr = xmalloc(i); memcpy(details_new->mc_ptr, job_details->mc_ptr, i); } details_new->mem_bind = xstrdup(job_details->mem_bind); details_new->mem_bind_type = job_details->mem_bind_type; if (job_details->req_node_bitmap) { details_new->req_node_bitmap = bit_copy(job_details->req_node_bitmap); } details_new->req_nodes = xstrdup(job_details->req_nodes); details_new->std_err = xstrdup(job_details->std_err); details_new->std_in = xstrdup(job_details->std_in); details_new->std_out = xstrdup(job_details->std_out); details_new->work_dir = xstrdup(job_details->work_dir); details_new->x11_magic_cookie = xstrdup(job_details->x11_magic_cookie); if (job_ptr->fed_details) { add_fed_job_info(job_ptr); /* * The new (split) job needs its remote dependencies tested * separately from just the meta job, so send remote * dependencies to siblings if needed. */ if (job_ptr->details->dependency && job_ptr->details->depend_list) fed_mgr_submit_remote_dependencies(job_ptr, false, false); } return job_ptr_pend; } /* Add job array data stucture to the job record */ static void _create_job_array(job_record_t *job_ptr, job_desc_msg_t *job_specs) { struct job_details *details; char *sep = NULL; int max_run_tasks, min_task_id, max_task_id, step_task_id = 1, task_cnt; uint32_t i_cnt; if (!job_specs->array_bitmap) return; i_cnt = bit_set_count(job_specs->array_bitmap); if (i_cnt == 0) { info("%s: %pJ array_bitmap is empty", __func__, job_ptr); return; } job_ptr->array_job_id = job_ptr->job_id; job_ptr->array_recs = xmalloc(sizeof(job_array_struct_t)); min_task_id = bit_ffs(job_specs->array_bitmap); max_task_id = bit_fls(job_specs->array_bitmap); task_cnt = bit_set_count(job_specs->array_bitmap); i_cnt = max_task_id + 1; job_specs->array_bitmap = bit_realloc(job_specs->array_bitmap, i_cnt); job_ptr->array_recs->task_id_bitmap = job_specs->array_bitmap; job_specs->array_bitmap = NULL; job_ptr->array_recs->task_cnt = bit_set_count(job_ptr->array_recs->task_id_bitmap); if (job_ptr->array_recs->task_cnt > 1) job_count += (job_ptr->array_recs->task_cnt - 1); if (job_specs->array_inx) sep = strchr(job_specs->array_inx, '%'); if (sep) { max_run_tasks = atoi(sep + 1); if (max_run_tasks > 0) job_ptr->array_recs->max_run_tasks = max_run_tasks; } details = job_ptr->details; if (details) { if (job_specs->array_inx) { sep = strchr(job_specs->array_inx, ':'); if (sep) step_task_id = atoi(sep + 1); } details->env_sup = xrealloc(details->env_sup, (sizeof(char *) * (details->env_cnt + 4))); xstrfmtcat(details->env_sup[details->env_cnt++], "SLURM_ARRAY_TASK_COUNT=%d", task_cnt); xstrfmtcat(details->env_sup[details->env_cnt++], "SLURM_ARRAY_TASK_MIN=%d", min_task_id); xstrfmtcat(details->env_sup[details->env_cnt++], "SLURM_ARRAY_TASK_MAX=%d", max_task_id); xstrfmtcat(details->env_sup[details->env_cnt++], "SLURM_ARRAY_TASK_STEP=%d", step_task_id); } } /* * Wrapper for select_nodes() function that will test all valid partitions * for a new job * IN job_ptr - pointer to the job record * IN test_only - if set do not allocate nodes, just confirm they * could be allocated now * IN select_node_bitmap - bitmap of nodes to be used for the * job's resource allocation (not returned if NULL), caller * must free * OUT err_msg - error message for job, caller must xfree */ static int _select_nodes_parts(job_record_t *job_ptr, bool test_only, bitstr_t **select_node_bitmap, char **err_msg) { part_record_t *part_ptr; ListIterator iter; int rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; int best_rc = -1, part_limits_rc = WAIT_NO_REASON; bitstr_t *save_avail_node_bitmap = NULL; save_avail_node_bitmap = bit_copy(avail_node_bitmap); bit_or(avail_node_bitmap, rs_node_bitmap); if (job_ptr->part_ptr_list) { list_sort(job_ptr->part_ptr_list, priority_sort_part_tier); iter = list_iterator_create(job_ptr->part_ptr_list); while ((part_ptr = list_next(iter))) { job_ptr->part_ptr = part_ptr; debug2("Try %pJ on next partition %s", job_ptr, part_ptr->name); part_limits_rc = job_limits_check(&job_ptr, false); if ((part_limits_rc != WAIT_NO_REASON) && (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ANY)) continue; if ((part_limits_rc != WAIT_NO_REASON) && (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ALL)) { if (part_limits_rc != WAIT_PART_DOWN) { best_rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; break; } else { best_rc = ESLURM_PARTITION_DOWN; } } if (part_limits_rc == WAIT_NO_REASON) { rc = select_nodes(job_ptr, test_only, select_node_bitmap, err_msg, true, SLURMDB_JOB_FLAG_SUBMIT); } else { rc = select_nodes(job_ptr, true, select_node_bitmap, err_msg, true, SLURMDB_JOB_FLAG_SUBMIT); if ((rc == SLURM_SUCCESS) && (part_limits_rc == WAIT_PART_DOWN)) rc = ESLURM_PARTITION_DOWN; } if ((rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ALL)) { best_rc = rc; /* Job can not run */ break; } if ((rc != ESLURM_REQUESTED_NODE_CONFIG_UNAVAILABLE) && (rc != ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && (rc != ESLURM_RESERVATION_BUSY) && (rc != ESLURM_NODES_BUSY)) { best_rc = rc; /* Job can run now */ if ((slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ANY) || (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_NONE) || (!test_only && (part_limits_rc == WAIT_NO_REASON))) { break; } } if (((rc == ESLURM_NODES_BUSY) || (rc == ESLURM_RESERVATION_BUSY)) && (best_rc == -1) && ((slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ANY) || (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_NONE))) { if (test_only) break; best_rc = rc; /* Keep looking for partition * where job can start now */ } if ((job_ptr->preempt_in_progress) && (rc != ESLURM_NODES_BUSY)) { /* Already started preempting jobs, don't * consider starting this job in another * partition as we iterator over others. */ test_only = true; } } list_iterator_destroy(iter); if (best_rc != -1) rc = best_rc; else if (part_limits_rc == WAIT_PART_DOWN) rc = ESLURM_PARTITION_DOWN; } else { part_limits_rc = job_limits_check(&job_ptr, false); if (part_limits_rc == WAIT_NO_REASON) { rc = select_nodes(job_ptr, test_only, select_node_bitmap, err_msg, true, SLURMDB_JOB_FLAG_SUBMIT); } else if (part_limits_rc == WAIT_PART_DOWN) { rc = select_nodes(job_ptr, true, select_node_bitmap, err_msg, true, SLURMDB_JOB_FLAG_SUBMIT); if (rc == SLURM_SUCCESS) rc = ESLURM_PARTITION_DOWN; } } if (rc == ESLURM_NODES_BUSY) job_ptr->state_reason = WAIT_RESOURCES; else if ((rc == ESLURM_RESERVATION_BUSY) || (rc == ESLURM_RESERVATION_NOT_USABLE)) job_ptr->state_reason = WAIT_RESERVATION; else if (rc == ESLURM_JOB_HELD) /* Do not reset the state_reason field here. select_nodes() * already set the state_reason field, and this error code * does not distinguish between user and admin holds. */ ; else if (rc == ESLURM_NODE_NOT_AVAIL) job_ptr->state_reason = WAIT_NODE_NOT_AVAIL; else if (rc == ESLURM_QOS_THRES) job_ptr->state_reason = WAIT_QOS_THRES; else if (rc == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) job_ptr->state_reason = WAIT_PART_CONFIG; else if (rc == ESLURM_POWER_NOT_AVAIL) job_ptr->state_reason = WAIT_POWER_NOT_AVAIL; else if (rc == ESLURM_BURST_BUFFER_WAIT) job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE; else if (rc == ESLURM_POWER_RESERVED) job_ptr->state_reason = WAIT_POWER_RESERVED; else if (rc == ESLURM_PARTITION_DOWN) job_ptr->state_reason = WAIT_PART_DOWN; else if (rc == ESLURM_INVALID_QOS) job_ptr->state_reason = FAIL_QOS; else if (rc == ESLURM_INVALID_ACCOUNT) job_ptr->state_reason = FAIL_ACCOUNT; FREE_NULL_BITMAP(avail_node_bitmap); avail_node_bitmap = save_avail_node_bitmap; return rc; } static inline bool _has_deadline(job_record_t *job_ptr) { if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) { queue_job_scheduler(); return true; } return false; } /* * job_allocate - create job_records for the supplied job specification and * allocate nodes for it. * IN job_specs - job specifications * IN immediate - if set then either initiate the job immediately or fail * IN will_run - don't initiate the job if set, just test if it could run * now or later * OUT resp - will run response (includes start location, time, etc.) * IN allocate - resource allocation request only if set, batch job if zero * IN submit_uid -uid of user issuing the request * OUT job_pptr - set to pointer to job record * OUT err_msg - Custom error message to the user, caller to xfree results * IN protocol_version - version of the code the caller is using * RET 0 or an error code. If the job would only be able to execute with * some change in partition configuration then * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned * globals: job_list - pointer to global job list * list_part - global list of partition info * default_part_loc - pointer to default partition */ extern int job_allocate(job_desc_msg_t * job_specs, int immediate, int will_run, will_run_response_msg_t **resp, int allocate, uid_t submit_uid, job_record_t **job_pptr, char **err_msg, uint16_t protocol_version) { static time_t sched_update = 0; static int defer_sched = 0; char *sched_params, *tmp_ptr; int error_code, i; bool no_alloc, top_prio, test_only, too_fragmented, independent; job_record_t *job_ptr; time_t now = time(NULL); bool held_user = false; xassert(verify_lock(CONF_LOCK, READ_LOCK)); xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); xassert(verify_lock(PART_LOCK, READ_LOCK)); if (sched_update != slurmctld_conf.last_update) { sched_update = slurmctld_conf.last_update; sched_params = slurm_get_sched_params(); if (xstrcasestr(sched_params, "defer")) defer_sched = 1; else defer_sched = 0; if ((tmp_ptr = xstrcasestr(sched_params, "delay_boot="))) { char *tmp_comma; if ((tmp_comma = xstrstr(tmp_ptr, ","))) *tmp_comma = '\0'; i = time_str2secs(tmp_ptr + 11); if (i != NO_VAL) delay_boot = i; if (tmp_comma) *tmp_comma = ','; } bf_min_age_reserve = 0; if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_age_reserve="))) { int min_age = atoi(tmp_ptr + 19); if (min_age > 0) bf_min_age_reserve = min_age; } if (xstrcasestr(sched_params, "allow_zero_lic")) validate_cfgd_licenses = false; xfree(sched_params); } if (job_specs->array_bitmap) i = bit_set_count(job_specs->array_bitmap); else i = 1; if ((job_count + i) >= slurmctld_conf.max_job_cnt) { error("%s: MaxJobCount limit from slurm.conf reached (%u)", __func__, slurmctld_conf.max_job_cnt); return EAGAIN; } error_code = _job_create(job_specs, allocate, will_run, &job_ptr, submit_uid, err_msg, protocol_version); *job_pptr = job_ptr; if (error_code) { if (job_ptr && (immediate || will_run)) { /* this should never really happen here */ job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = now; job_completion_logger(job_ptr, false); error("%s: setting %pJ to \"%s\"", __func__, job_ptr, job_reason_string(job_ptr->state_reason)); } return error_code; } xassert(job_ptr); if (job_specs->array_bitmap) independent = false; else independent = job_independent(job_ptr); /* * priority needs to be calculated after this since we set a * begin time in job_independent and that lets us know if the * job is eligible. */ if (job_ptr->priority == NO_VAL) set_job_prio(job_ptr); if (job_ptr->state_reason == WAIT_HELD_USER) held_user = true; if (independent && (license_job_test(job_ptr, time(NULL), true) != SLURM_SUCCESS)) independent = false; /* Avoid resource fragmentation if important */ if ((submit_uid || (job_specs->req_nodes == NULL)) && independent && job_is_completing(NULL)) too_fragmented = true; /* Don't pick nodes for job now */ /* * FIXME: Ideally we only want to refuse the request if the * required node list is insufficient to satisfy the job's * processor or node count requirements, but the overhead is * rather high to do that right here. We let requests from * user root proceed if a node list is specified, for * meta-schedulers (e.g. Maui, Moab, etc.). */ else too_fragmented = false; if (independent && (!too_fragmented) && !defer_sched) top_prio = _top_priority(job_ptr, job_specs->het_job_offset); else top_prio = true; /* don't bother testing, * it is not runable anyway */ if (immediate && (too_fragmented || (!top_prio) || (!independent) || defer_sched)) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = now; job_completion_logger(job_ptr, false); if (!independent) { debug2("%s: setting %pJ to \"%s\" due to dependency (%s)", __func__, job_ptr, job_reason_string(job_ptr->state_reason), slurm_strerror(ESLURM_DEPENDENCY)); return ESLURM_DEPENDENCY; } else if (too_fragmented) { debug2("%s: setting %pJ to \"%s\" due to fragmentation (%s)", __func__, job_ptr, job_reason_string(job_ptr->state_reason), slurm_strerror(ESLURM_FRAGMENTATION)); return ESLURM_FRAGMENTATION; } else if (!top_prio) { debug2("%s: setting %pJ to \"%s\" because it's not top priority (%s)", __func__, job_ptr, job_reason_string(job_ptr->state_reason), slurm_strerror(ESLURM_NOT_TOP_PRIORITY)); return ESLURM_NOT_TOP_PRIORITY; } else { job_ptr->state_reason = FAIL_DEFER; debug2("%s: setting %pJ to \"%s\" due to SchedulerParameters=defer (%s)", __func__, job_ptr, job_reason_string(job_ptr->state_reason), slurm_strerror(ESLURM_DEFER)); return ESLURM_DEFER; } } if (will_run && resp) { job_desc_msg_t job_desc_msg; int rc; slurm_init_job_desc_msg(&job_desc_msg); job_desc_msg.job_id = job_ptr->job_id; rc = job_start_data(&job_desc_msg, resp); job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->start_time = job_ptr->end_time = now; purge_job_record(job_ptr->job_id); return rc; } /* * fed jobs need to go to the siblings first so don't attempt to * schedule the job now. */ test_only = will_run || job_ptr->deadline || (allocate == 0) || job_ptr->fed_details; no_alloc = test_only || too_fragmented || _has_deadline(job_ptr) || (!top_prio) || (!independent) || !avail_front_end(job_ptr) || (job_specs->het_job_offset != NO_VAL) || defer_sched; no_alloc = no_alloc || (bb_g_job_test_stage_in(job_ptr, no_alloc) != 1); error_code = _select_nodes_parts(job_ptr, no_alloc, NULL, err_msg); if (!test_only) { last_job_update = now; } if (held_user) job_ptr->state_reason = WAIT_HELD_USER; /* * Moved this (_create_job_array) here to handle when a job * array is submitted since we * want to know the array task count when we check the job against * QOS/Assoc limits */ _create_job_array(job_ptr, job_specs); slurmctld_diag_stats.jobs_submitted += (job_ptr->array_recs && job_ptr->array_recs->task_cnt) ? job_ptr->array_recs->task_cnt : 1; acct_policy_add_job_submit(job_ptr); if ((error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) && (slurmctld_conf.enforce_part_limits != PARTITION_ENFORCE_NONE)) ; /* Reject job submission */ else if ((error_code == ESLURM_NODES_BUSY) || (error_code == ESLURM_RESERVATION_BUSY) || (error_code == ESLURM_JOB_HELD) || (error_code == ESLURM_NODE_NOT_AVAIL) || (error_code == ESLURM_QOS_THRES) || (error_code == ESLURM_ACCOUNTING_POLICY) || (error_code == ESLURM_RESERVATION_NOT_USABLE) || (error_code == ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) || (error_code == ESLURM_POWER_NOT_AVAIL) || (error_code == ESLURM_BURST_BUFFER_WAIT) || (error_code == ESLURM_POWER_RESERVED) || (error_code == ESLURM_PARTITION_DOWN)) { /* Not fatal error, but job can't be scheduled right now */ if (immediate) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = now; job_completion_logger(job_ptr, false); debug2("%s: setting %pJ to \"%s\" because it cannot be immediately allocated (%s)", __func__, job_ptr, job_reason_string(job_ptr->state_reason), slurm_strerror(error_code)); } else { /* job remains queued */ if ((error_code == ESLURM_NODES_BUSY) || (error_code == ESLURM_BURST_BUFFER_WAIT) || (error_code == ESLURM_RESERVATION_BUSY) || (error_code == ESLURM_ACCOUNTING_POLICY) || ((error_code == ESLURM_PARTITION_DOWN) && (job_ptr->batch_flag))) { error_code = SLURM_SUCCESS; } } return error_code; } if (error_code) { /* fundamental flaw in job request */ job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_BAD_CONSTRAINTS; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = now; job_completion_logger(job_ptr, false); debug2("%s: setting %pJ to \"%s\" due to a flaw in the job request (%s)", __func__, job_ptr, job_reason_string(job_ptr->state_reason), slurm_strerror(error_code)); return error_code; } if (will_run) { /* job would run, flag job destruction */ job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->start_time = job_ptr->end_time = now; purge_job_record(job_ptr->job_id); } else if (!with_slurmdbd) jobacct_storage_g_job_start(acct_db_conn, job_ptr); if (!will_run) { sched_debug2("%pJ allocated resources: NodeList=%s", job_ptr, job_ptr->nodes); rebuild_job_part_list(job_ptr); } return SLURM_SUCCESS; } /* * job_fail - terminate a job due to initiation failure * IN job_ptr - Pointer to job to be killed * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.) * RET 0 on success, otherwise ESLURM error code */ static int _job_fail(job_record_t *job_ptr, uint32_t job_state) { time_t now = time(NULL); bool suspended = false; if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; if (IS_JOB_SUSPENDED(job_ptr)) { uint32_t suspend_job_state = job_ptr->job_state; /* * we can't have it as suspended when we call the * accounting stuff. */ job_ptr->job_state = JOB_CANCELLED; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; suspended = true; } if (IS_JOB_CONFIGURING(job_ptr) || IS_JOB_RUNNING(job_ptr) || suspended) { /* No need to signal steps, deallocate kills them */ job_ptr->time_last_active = now; if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = now; last_job_update = now; job_ptr->job_state = job_state | JOB_COMPLETING; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_LAUNCH; xfree(job_ptr->state_desc); job_completion_logger(job_ptr, false); if (job_ptr->node_bitmap) { build_cg_bitmap(job_ptr); deallocate_nodes(job_ptr, false, suspended, false); } return SLURM_SUCCESS; } /* All other states */ verbose("job_fail: %pJ can't be killed from state=%s", job_ptr, job_state_string(job_ptr->job_state)); return ESLURM_TRANSITION_STATE_NO_UPDATE; } /* * job_fail - terminate a job due to initiation failure * IN job_id - ID of the job to be killed * IN job_state - desired job state (JOB_BOOT_FAIL, JOB_NODE_FAIL, etc.) * RET 0 on success, otherwise ESLURM error code */ extern int job_fail(uint32_t job_id, uint32_t job_state) { job_record_t *job_ptr, *het_job, *het_job_leader; ListIterator iter; int rc = SLURM_SUCCESS, rc1; job_ptr = find_job_record(job_id); if (job_ptr == NULL) { error("job_fail: invalid JobId=%u", job_id); return ESLURM_INVALID_JOB_ID; } if (job_ptr->het_job_id == 0) return _job_fail(job_ptr, job_state); het_job_leader = find_job_record(job_ptr->het_job_id); if (!het_job_leader) { error("%s: Hetjob leader %pJ not found", __func__, job_ptr); return _job_fail(job_ptr, job_state); } if (!het_job_leader->het_job_list) { error("%s: Hetjob leader %pJ job list is NULL", __func__, job_ptr); return _job_fail(job_ptr, job_state); } iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { if (het_job_leader->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, het_job_leader); continue; } rc1 = _job_fail(het_job, job_state); if (rc1 != SLURM_SUCCESS) rc = rc1; } list_iterator_destroy(iter); return rc; } /* * Signal a job based upon job pointer. * Authentication and authorization checks must be performed before calling. */ extern int job_signal(job_record_t *job_ptr, uint16_t signal, uint16_t flags, uid_t uid, bool preempt) { uint16_t job_term_state; time_t now = time(NULL); trace_job(job_ptr, __func__, "enter"); if (IS_JOB_STAGE_OUT(job_ptr) && (flags & KILL_HURRY)) { job_ptr->bit_flags |= JOB_KILL_HURRY; return bb_g_job_cancel(job_ptr); } if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; /* * If is origin job then cancel siblings -- if they exist. * origin job = because it knows where the siblings are * If the job is running locally then just do the normal signaling */ if (!(flags & KILL_NO_SIBS) && !IS_JOB_RUNNING(job_ptr) && job_ptr->fed_details && fed_mgr_fed_rec) { uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id); slurmdb_cluster_rec_t *origin = fed_mgr_get_cluster_by_id(origin_id); if (origin && (origin == fed_mgr_cluster_rec) && fed_mgr_job_started_on_sib(job_ptr)) { /* * If the job is running on a remote cluster then wait * for the job to report back that it's completed, * otherwise just signal the pending siblings and itself * (by not returning). */ return fed_mgr_job_cancel(job_ptr, signal, flags, uid, false); } else if (origin && (origin == fed_mgr_cluster_rec)) { /* cancel origin job and revoke sibling jobs */ fed_mgr_job_revoke_sibs(job_ptr); fed_mgr_remove_remote_dependencies(job_ptr); } else if (!origin || !origin->fed.send || (((slurm_persist_conn_t *)origin->fed.send)->fd == -1)) { /* * The origin is down just signal all of the viable * sibling jobs */ fed_mgr_job_cancel(job_ptr, signal, flags, uid, true); } } /* let node select plugin do any state-dependent signaling actions */ select_g_job_signal(job_ptr, signal); last_job_update = now; /* save user ID of the one who requested the job be cancelled */ if (signal == SIGKILL) job_ptr->requid = uid; if (IS_JOB_PENDING(job_ptr) && IS_JOB_COMPLETING(job_ptr) && (signal == SIGKILL)) { /* Prevent job requeue, otherwise preserve state */ job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING; /* build_cg_bitmap() not needed, job already completing */ verbose("%s: %u of requeuing %pJ successful", __func__, signal, job_ptr); return SLURM_SUCCESS; } if (flags & KILL_HURRY) job_ptr->bit_flags |= JOB_KILL_HURRY; if (IS_JOB_CONFIGURING(job_ptr) && (signal == SIGKILL)) { last_job_update = now; job_ptr->end_time = now; job_ptr->job_state = JOB_CANCELLED | JOB_COMPLETING; if (flags & KILL_FED_REQUEUE) job_ptr->job_state |= JOB_REQUEUE; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, false); deallocate_nodes(job_ptr, false, false, false); if (flags & KILL_FED_REQUEUE) { job_ptr->job_state &= (~JOB_REQUEUE); } verbose("%s: %u of configuring %pJ successful", __func__, signal, job_ptr); return SLURM_SUCCESS; } if (IS_JOB_PENDING(job_ptr) && (signal == SIGKILL)) { job_ptr->job_state = JOB_CANCELLED; if (flags & KILL_FED_REQUEUE) job_ptr->job_state |= JOB_REQUEUE; job_ptr->start_time = now; job_ptr->end_time = now; srun_allocate_abort(job_ptr); job_completion_logger(job_ptr, false); if (flags & KILL_FED_REQUEUE) { job_ptr->job_state &= (~JOB_REQUEUE); } /* * Send back a response to the origin cluster, in other cases * where the job is running the job will send back a response * after the job is is completed. This can happen when the * pending origin job is put into a hold state and the siblings * are removed or when the job is canceled from the origin. */ fed_mgr_job_complete(job_ptr, 0, now); verbose("%s: %u of pending %pJ successful", __func__, signal, job_ptr); return SLURM_SUCCESS; } if (preempt) job_term_state = JOB_PREEMPTED; else job_term_state = JOB_CANCELLED; if (IS_JOB_SUSPENDED(job_ptr) && (signal == SIGKILL)) { last_job_update = now; job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); job_ptr->job_state = job_term_state | JOB_COMPLETING; if (flags & KILL_FED_REQUEUE) job_ptr->job_state |= JOB_REQUEUE; build_cg_bitmap(job_ptr); jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_completion_logger(job_ptr, false); if (flags & KILL_FED_REQUEUE) job_ptr->job_state &= (~JOB_REQUEUE); deallocate_nodes(job_ptr, false, true, preempt); verbose("%s: %u of suspended %pJ successful", __func__, signal, job_ptr); return SLURM_SUCCESS; } if (IS_JOB_RUNNING(job_ptr)) { if ((signal == SIGSTOP) || (signal == SIGCONT)) { if (IS_JOB_SIGNALING(job_ptr)) { verbose("%s: %u not send to %pJ 0x%x", __func__, signal, job_ptr, job_ptr->job_state); return ESLURM_TRANSITION_STATE_NO_UPDATE; } job_ptr->job_state |= JOB_SIGNALING; } if ((signal == SIGKILL) && !(flags & KILL_STEPS_ONLY) && !(flags & KILL_JOB_BATCH)) { /* No need to signal steps, deallocate kills them */ job_ptr->time_last_active = now; job_ptr->end_time = now; last_job_update = now; job_ptr->job_state = job_term_state | JOB_COMPLETING; if (flags & KILL_FED_REQUEUE) job_ptr->job_state |= JOB_REQUEUE; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, false); deallocate_nodes(job_ptr, false, false, preempt); if (flags & KILL_FED_REQUEUE) job_ptr->job_state &= (~JOB_REQUEUE); } else if (job_ptr->batch_flag && (flags & KILL_JOB_BATCH)) { _signal_batch_job(job_ptr, signal, flags); } else if ((flags & KILL_JOB_BATCH) && !job_ptr->batch_flag) { if ((signal == SIGSTOP) || (signal == SIGCONT)) job_ptr->job_state &= ~JOB_SIGNALING; return ESLURM_JOB_SCRIPT_MISSING; } else { _signal_job(job_ptr, signal, flags); } verbose("%s: %u of running %pJ successful 0x%x", __func__, signal, job_ptr, job_ptr->job_state); return SLURM_SUCCESS; } verbose("%s: %pJ can't be sent signal %u from state=%s", __func__, job_ptr, signal, job_state_string(job_ptr->job_state)); trace_job(job_ptr, __func__, "return"); return ESLURM_TRANSITION_STATE_NO_UPDATE; } /* * job_signal_id - signal the specified job * IN job_id - id of the job to be signaled * IN signal - signal to send, SIGKILL == cancel the job * IN flags - see KILL_JOB_* flags in slurm.h * IN uid - uid of requesting user * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ extern int job_signal_id(uint32_t job_id, uint16_t signal, uint16_t flags, uid_t uid, bool preempt) { job_record_t *job_ptr; job_ptr = find_job_record(job_id); if (job_ptr == NULL) { info("%s: invalid JobId=%u", __func__, job_id); return ESLURM_INVALID_JOB_ID; } if ((job_ptr->user_id != uid) && !validate_operator(uid) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) { error("Security violation, JOB_CANCEL RPC for %pJ from uid %u", job_ptr, uid); return ESLURM_ACCESS_DENIED; } return job_signal(job_ptr, signal, flags, uid, preempt); } /* Signal all components of a hetjob */ extern int het_job_signal(job_record_t *het_job_leader, uint16_t signal, uint16_t flags, uid_t uid, bool preempt) { ListIterator iter; int rc = SLURM_SUCCESS, rc1; job_record_t *het_job; iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { if (het_job_leader->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, het_job_leader); continue; } rc1 = job_signal(het_job, signal, flags, uid, preempt); if (rc1 != SLURM_SUCCESS) rc = rc1; } list_iterator_destroy(iter); return rc; } static bool _get_whole_hetjob(void) { static time_t sched_update = 0; static bool whole_hetjob = false; char *sched_params = NULL; if (sched_update != slurmctld_conf.last_update) { sched_update = slurmctld_conf.last_update; sched_params = slurm_get_sched_params(); if (xstrcasestr(sched_params, "whole_hetjob") || xstrcasestr(sched_params, "whole_pack")) whole_hetjob = true; else whole_hetjob = false; xfree(sched_params); } return whole_hetjob; } /* * job_str_signal - signal the specified job * IN job_id_str - id of the job to be signaled, valid formats include "#" * "#_#" and "#_[expr]" * IN signal - signal to send, SIGKILL == cancel the job * IN flags - see KILL_JOB_* flags in slurm.h * IN uid - uid of requesting user * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ extern int job_str_signal(char *job_id_str, uint16_t signal, uint16_t flags, uid_t uid, bool preempt) { job_record_t *job_ptr; uint32_t job_id; time_t now = time(NULL); char *end_ptr = NULL, *tok, *tmp; long int long_id; bitstr_t *array_bitmap = NULL; bool valid = true; int32_t i, i_first, i_last; int rc = SLURM_SUCCESS, rc2, len; if (max_array_size == NO_VAL) { max_array_size = slurmctld_conf.max_array_sz; } long_id = strtol(job_id_str, &end_ptr, 10); if ((long_id <= 0) || (long_id == LONG_MAX) || ((end_ptr[0] != '\0') && (end_ptr[0] != '_') && (end_ptr[0] != '+'))) { info("%s(1): invalid JobId=%s", __func__, job_id_str); return ESLURM_INVALID_JOB_ID; } if ((end_ptr[0] == '_') && (end_ptr[1] == '*')) end_ptr += 2; /* Defaults to full job array */ if (end_ptr[0] == '+') { /* Signal hetjob element */ job_id = (uint32_t) long_id; long_id = strtol(end_ptr + 1, &end_ptr, 10); if ((long_id < 0) || (long_id == LONG_MAX) || (end_ptr[0] != '\0')) { info("%s(2): invalid JobId=%s", __func__, job_id_str); return ESLURM_INVALID_JOB_ID; } job_ptr = find_het_job_record(job_id, (uint32_t) long_id); if (!job_ptr) return ESLURM_ALREADY_DONE; if ((job_ptr->user_id != uid) && !validate_operator(uid) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) { error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u", job_ptr, uid); return ESLURM_ACCESS_DENIED; } if (IS_JOB_PENDING(job_ptr)) return ESLURM_NOT_WHOLE_HET_JOB; return job_signal(job_ptr, signal, flags, uid,preempt); } last_job_update = now; job_id = (uint32_t) long_id; if (end_ptr[0] == '\0') { /* Single job (or full job array) */ int jobs_done = 0, jobs_signaled = 0; job_record_t *job_ptr_done = NULL; job_ptr = find_job_record(job_id); if (job_ptr && (job_ptr->user_id != uid) && !validate_operator(uid) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) { error("Security violation, REQUEST_KILL_JOB RPC for %pJ from uid %u", job_ptr, uid); return ESLURM_ACCESS_DENIED; } if (job_ptr && job_ptr->het_job_list) { /* Hetjob leader */ return het_job_signal(job_ptr, signal, flags, uid, preempt); } if (job_ptr && job_ptr->het_job_id && _get_whole_hetjob()) { job_record_t *het_job_leader; het_job_leader = find_job_record(job_ptr->het_job_id); if (het_job_leader && het_job_leader->het_job_list) { return het_job_signal(het_job_leader, signal, flags, uid, preempt); } error("%s: Hetjob leader %pJ not found", __func__, job_ptr); } if (job_ptr && job_ptr->het_job_id && IS_JOB_PENDING(job_ptr)) return ESLURM_NOT_WHOLE_HET_JOB;/* Hetjob child */ if (job_ptr && (job_ptr->array_task_id == NO_VAL) && (job_ptr->array_recs == NULL)) { /* This is a regular job, not a job array */ return job_signal_id(job_id, signal, flags, uid, preempt); } /* * This will kill the meta record that holds all * pending jobs. We want to kill this first so we * don't start jobs just to kill them as we are * killing other elements of the array. */ if (job_ptr && job_ptr->array_recs) { /* This is a job array */ job_ptr_done = job_ptr; rc = job_signal(job_ptr, signal, flags, uid, preempt); if (rc == ESLURM_ACCESS_DENIED) return rc; jobs_signaled++; if (rc == ESLURM_ALREADY_DONE) { jobs_done++; rc = SLURM_SUCCESS; } } /* Signal all tasks of this job array */ job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; if (!job_ptr && !job_ptr_done) { info("%s(3): invalid JobId=%u", __func__, job_id); return ESLURM_INVALID_JOB_ID; } while (job_ptr) { if (job_ptr->array_job_id == job_id) break; job_ptr = job_ptr->job_array_next_j; } while (job_ptr) { if ((job_ptr->array_job_id == job_id) && (job_ptr != job_ptr_done)) { rc2 = job_signal(job_ptr, signal, flags, uid, preempt); jobs_signaled++; if (rc2 == ESLURM_ALREADY_DONE) { jobs_done++; } else { rc = MAX(rc, rc2); } } job_ptr = job_ptr->job_array_next_j; } if ((rc == SLURM_SUCCESS) && (jobs_done == jobs_signaled)) return ESLURM_ALREADY_DONE; return rc; } array_bitmap = bit_alloc(max_array_size); tmp = xstrdup(end_ptr + 1); tok = strtok_r(tmp, ",", &end_ptr); while (tok && valid) { valid = _parse_array_tok(tok, array_bitmap, max_array_size); tok = strtok_r(NULL, ",", &end_ptr); } xfree(tmp); if (valid) { i_last = bit_fls(array_bitmap); if (i_last < 0) valid = false; } if (!valid) { info("%s(4): invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto endit; } /* Find some job record and validate the user signaling the job */ job_ptr = find_job_record(job_id); if (job_ptr == NULL) { job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; while (job_ptr) { if (job_ptr->array_job_id == job_id) break; job_ptr = job_ptr->job_array_next_j; } } if ((job_ptr == NULL) || ((job_ptr->array_task_id == NO_VAL) && (job_ptr->array_recs == NULL))) { info("%s(5): invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto endit; } if ((job_ptr->user_id != uid) && !validate_operator(uid) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) { error("%s: Security violation JOB_CANCEL RPC for %pJ from uid %u", __func__, job_ptr, uid); rc = ESLURM_ACCESS_DENIED; goto endit; } if (IS_JOB_PENDING(job_ptr) && job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { /* Ensure bitmap sizes match for AND operations */ len = bit_size(job_ptr->array_recs->task_id_bitmap); i_last++; if (i_last < len) { array_bitmap = bit_realloc(array_bitmap, len); } else { array_bitmap = bit_realloc(array_bitmap, i_last); job_ptr->array_recs->task_id_bitmap = bit_realloc( job_ptr->array_recs->task_id_bitmap, i_last); } if (signal == SIGKILL) { uint32_t orig_task_cnt, new_task_count; /* task_id_bitmap changes, so we need a copy of it */ bitstr_t *task_id_bitmap_orig = bit_copy(job_ptr->array_recs->task_id_bitmap); bit_and_not(job_ptr->array_recs->task_id_bitmap, array_bitmap); xfree(job_ptr->array_recs->task_id_str); orig_task_cnt = job_ptr->array_recs->task_cnt; new_task_count = bit_set_count(job_ptr->array_recs-> task_id_bitmap); if (!new_task_count) { last_job_update = now; job_ptr->job_state = JOB_CANCELLED; job_ptr->start_time = now; job_ptr->end_time = now; job_ptr->requid = uid; srun_allocate_abort(job_ptr); job_completion_logger(job_ptr, false); /* * Master job record, even wihtout tasks, * counts as one job record */ job_count -= (orig_task_cnt - 1); } else { _job_array_comp(job_ptr, false, false); job_count -= (orig_task_cnt - new_task_count); /* * Since we are altering the job array's * task_cnt we must go alter this count in the * acct_policy code as if they are finishing * (accrue_cnt/job_submit etc...). */ if (job_ptr->array_recs->task_cnt > new_task_count) { uint32_t tmp_state = job_ptr->job_state; job_ptr->job_state = JOB_CANCELLED; job_ptr->array_recs->task_cnt -= new_task_count; acct_policy_remove_job_submit(job_ptr); job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; job_ptr->job_state = tmp_state; } } /* * Set the task_cnt here since * job_completion_logger needs the total * pending count to handle the acct_policy * limit for submitted jobs correctly. */ job_ptr->array_recs->task_cnt = new_task_count; bit_and_not(array_bitmap, task_id_bitmap_orig); FREE_NULL_BITMAP(task_id_bitmap_orig); } else { bit_and_not(array_bitmap, job_ptr->array_recs->task_id_bitmap); rc = ESLURM_TRANSITION_STATE_NO_UPDATE; } } i_first = bit_ffs(array_bitmap); if (i_first >= 0) i_last = bit_fls(array_bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(array_bitmap, i)) continue; job_ptr = find_job_array_rec(job_id, i); if (job_ptr == NULL) { info("%s(6): invalid JobId=%u_%d", __func__, job_id, i); rc = ESLURM_INVALID_JOB_ID; continue; } rc2 = job_signal(job_ptr, signal, flags, uid, preempt); rc = MAX(rc, rc2); } endit: FREE_NULL_BITMAP(array_bitmap); return rc; } static void _signal_batch_job(job_record_t *job_ptr, uint16_t signal, uint16_t flags) { bitoff_t i; signal_tasks_msg_t *signal_tasks_msg = NULL; agent_arg_t *agent_args = NULL; xassert(job_ptr); xassert(job_ptr->batch_host); i = bit_ffs(job_ptr->node_bitmap); if (i < 0) { error("%s: %pJ lacks assigned nodes", __func__, job_ptr); return; } agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SIGNAL_TASKS; agent_args->retry = 1; agent_args->node_count = 1; #ifdef HAVE_FRONT_END if (job_ptr->front_end_ptr) agent_args->protocol_version = job_ptr->front_end_ptr->protocol_version; #else node_record_t *node_ptr; if ((node_ptr = find_node_record(job_ptr->batch_host))) agent_args->protocol_version = node_ptr->protocol_version; #endif agent_args->hostlist = hostlist_create(job_ptr->batch_host); signal_tasks_msg = xmalloc(sizeof(signal_tasks_msg_t)); signal_tasks_msg->job_id = job_ptr->job_id; signal_tasks_msg->job_step_id = NO_VAL; signal_tasks_msg->flags = flags; signal_tasks_msg->signal = signal; agent_args->msg_args = signal_tasks_msg; agent_queue_request(agent_args); return; } /* * prolog_complete - note the normal termination of the prolog * IN job_id - id of the job which completed * IN prolog_return_code - prolog's return code, * if set then set job state to FAILED * RET - 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ extern int prolog_complete(uint32_t job_id, uint32_t prolog_return_code) { job_record_t *job_ptr; job_ptr = find_job_record(job_id); if (job_ptr == NULL) { info("prolog_complete: invalid JobId=%u", job_id); return ESLURM_INVALID_JOB_ID; } if (IS_JOB_COMPLETING(job_ptr)) return SLURM_SUCCESS; if (prolog_return_code) error("Prolog launch failure, %pJ", job_ptr); job_ptr->state_reason = WAIT_NO_REASON; return SLURM_SUCCESS; } static int _job_complete(job_record_t *job_ptr, uid_t uid, bool requeue, bool node_fail, uint32_t job_return_code) { node_record_t *node_ptr; time_t now = time(NULL); uint32_t job_comp_flag = 0; bool suspended = false; int i; int use_cloud = false; uint16_t over_time_limit; xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); xassert(verify_lock(FED_LOCK, READ_LOCK)); if (IS_JOB_FINISHED(job_ptr)) { if (job_ptr->exit_code == 0) job_ptr->exit_code = job_return_code; return ESLURM_ALREADY_DONE; } if (IS_JOB_COMPLETING(job_ptr)) return SLURM_SUCCESS; /* avoid replay */ if ((job_return_code & 0xff) == SIG_OOM) { info("%s: %pJ OOM failure", __func__, job_ptr); } else if (WIFSIGNALED(job_return_code)) { info("%s: %pJ WTERMSIG %d", __func__, job_ptr, WTERMSIG(job_return_code)); } else if (WIFEXITED(job_return_code)) { info("%s: %pJ WEXITSTATUS %d", __func__, job_ptr, WEXITSTATUS(job_return_code)); } if (IS_JOB_RUNNING(job_ptr)) job_comp_flag = JOB_COMPLETING; else if (IS_JOB_PENDING(job_ptr)) { job_return_code = NO_VAL; job_ptr->start_time = now; fed_mgr_job_revoke_sibs(job_ptr); } if ((job_return_code == NO_VAL) && (IS_JOB_RUNNING(job_ptr) || IS_JOB_PENDING(job_ptr))) { if (node_fail) { info("%s: %pJ cancelled by node failure", __func__, job_ptr); } else { info("%s: %pJ cancelled by interactive user", __func__, job_ptr); } } if (IS_JOB_SUSPENDED(job_ptr)) { uint32_t suspend_job_state = job_ptr->job_state; /* * we can't have it as suspended when we call the * accounting stuff. */ job_ptr->job_state = JOB_CANCELLED; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; job_comp_flag = JOB_COMPLETING; suspended = true; } if (job_comp_flag && (job_ptr->node_cnt == 0)) { /* * Job has no resources left (used to expand another job). * Avoid duplicate run of epilog and underflow in CPU count. */ job_comp_flag = 0; } if (requeue && job_ptr->details && job_ptr->batch_flag) { /* * We want this job to look like it was terminated in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. */ job_ptr->end_time = now; job_ptr->job_state = JOB_NODE_FAIL; job_completion_logger(job_ptr, true); /* * Do this after the epilog complete. * Setting it here is too early. */ //job_ptr->db_index = 0; //job_ptr->details->submit_time = now + 1; if (job_ptr->node_bitmap) { i = bit_ffs(job_ptr->node_bitmap); if (i >= 0) { node_ptr = node_record_table_ptr + i; if (IS_NODE_CLOUD(node_ptr)) use_cloud = true; } } if (!use_cloud) job_ptr->batch_flag++; /* only one retry */ job_ptr->restart_cnt++; /* clear signal sent flag on requeue */ job_ptr->warn_flags &= ~WARN_SENT; job_ptr->job_state = JOB_PENDING | job_comp_flag; /* * Since the job completion logger removes the job submit * information, we need to add it again. */ acct_policy_add_job_submit(job_ptr); if (node_fail) { info("%s: requeue %pJ due to node failure", __func__, job_ptr); } else { info("%s: requeue %pJ per user/system request", __func__, job_ptr); } /* * We have reached the maximum number of requeue * attempts hold the job with HoldMaxRequeue reason. */ if (job_ptr->batch_flag > MAX_BATCH_REQUEUE) { job_ptr->job_state |= JOB_REQUEUE_HOLD; job_ptr->state_reason = WAIT_MAX_REQUEUE; job_ptr->batch_flag = 1; debug("%s: Holding %pJ, repeated requeue failures", __func__, job_ptr); job_ptr->priority = 0; } } else if (IS_JOB_PENDING(job_ptr) && job_ptr->details && job_ptr->batch_flag) { /* * Possible failure mode with DOWN node and job requeue. * The DOWN node might actually respond to the cancel and * take us here. Don't run job_completion_logger here since * this is here to catch duplicate cancels from slowly * responding slurmds */ return SLURM_SUCCESS; } else { if (job_ptr->part_ptr && (job_ptr->part_ptr->over_time_limit != NO_VAL16)) { over_time_limit = job_ptr->part_ptr->over_time_limit; } else { over_time_limit = slurmctld_conf.over_time_limit; } if (node_fail) { job_ptr->job_state = JOB_NODE_FAIL | job_comp_flag; job_ptr->requid = uid; } else if (job_return_code == NO_VAL) { job_ptr->job_state = JOB_CANCELLED | job_comp_flag; job_ptr->requid = uid; } else if ((job_return_code & 0xff) == SIG_OOM) { job_ptr->job_state = JOB_OOM | job_comp_flag; job_ptr->exit_code = job_return_code; job_ptr->state_reason = FAIL_OOM; xfree(job_ptr->state_desc); } else if (WIFEXITED(job_return_code) && WEXITSTATUS(job_return_code)) { job_ptr->job_state = JOB_FAILED | job_comp_flag; job_ptr->exit_code = job_return_code; job_ptr->state_reason = FAIL_EXIT_CODE; xfree(job_ptr->state_desc); } else if (WIFSIGNALED(job_return_code)) { job_ptr->job_state = JOB_FAILED | job_comp_flag; job_ptr->exit_code = job_return_code; job_ptr->state_reason = FAIL_LAUNCH; } else if (job_comp_flag && ((job_ptr->end_time + over_time_limit * 60) < now)) { /* * Test if the job has finished before its allowed * over time has expired. */ job_ptr->job_state = JOB_TIMEOUT | job_comp_flag; job_ptr->state_reason = FAIL_TIMEOUT; xfree(job_ptr->state_desc); } else { job_ptr->job_state = JOB_COMPLETE | job_comp_flag; job_ptr->exit_code = job_return_code; if (nonstop_ops.job_fini) (nonstop_ops.job_fini)(job_ptr); } if (suspended) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); } else job_ptr->end_time = now; job_completion_logger(job_ptr, false); } last_job_update = now; job_ptr->time_last_active = now; /* Timer for resending kill RPC */ if (job_comp_flag) { /* job was running */ build_cg_bitmap(job_ptr); deallocate_nodes(job_ptr, false, suspended, false); } /* Check for and cleanup stuck scripts */ if (job_ptr->details && job_ptr->details->prolog_running) track_script_flush_job(job_ptr->job_id); info("%s: %pJ done", __func__, job_ptr); return SLURM_SUCCESS; } /* * job_complete - note the normal termination the specified job * IN job_id - id of the job which completed * IN uid - user id of user issuing the RPC * IN requeue - job should be run again if possible * IN node_fail - true if job terminated due to node failure * IN job_return_code - job's return code, if set then set state to FAILED * RET - 0 on success, otherwise ESLURM error code * global: job_list - pointer global job list * last_job_update - time of last job table update */ extern int job_complete(uint32_t job_id, uid_t uid, bool requeue, bool node_fail, uint32_t job_return_code) { job_record_t *job_ptr, *het_job_ptr; ListIterator iter; int rc, rc1; xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); xassert(verify_lock(FED_LOCK, READ_LOCK)); job_ptr = find_job_record(job_id); if (job_ptr == NULL) { info("%s: invalid JobId=%u", __func__, job_id); return ESLURM_INVALID_JOB_ID; } if ((job_ptr->user_id != uid) && !validate_slurm_user(uid)) { error("%s: Security violation, JOB_COMPLETE RPC for %pJ from uid %u", __func__, job_ptr, uid); return ESLURM_USER_ID_MISSING; } if (job_ptr->het_job_list) { rc = SLURM_SUCCESS; iter = list_iterator_create(job_ptr->het_job_list); while ((het_job_ptr = list_next(iter))) { if (job_ptr->het_job_id != het_job_ptr->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, job_ptr); continue; } rc1 = _job_complete(het_job_ptr, uid, requeue, node_fail, job_return_code); if (rc1 != SLURM_SUCCESS) rc = rc1; } list_iterator_destroy(iter); } else { rc = _job_complete(job_ptr, uid, requeue, node_fail, job_return_code); } return rc; } static int _alt_part_test(part_record_t *part_ptr, part_record_t **part_ptr_new) { part_record_t *alt_part_ptr = NULL; char *alt_name; *part_ptr_new = NULL; if ((part_ptr->state_up & PARTITION_SUBMIT) == 0) { info("_alt_part_test: original partition is not available " "(drain or inactive): %s", part_ptr->name); alt_name = part_ptr->alternate; while (alt_name) { alt_part_ptr = find_part_record(alt_name); if (alt_part_ptr == NULL) { info("_alt_part_test: invalid alternate " "partition name specified: %s", alt_name); return ESLURM_INVALID_PARTITION_NAME; } if (alt_part_ptr == part_ptr) { info("_alt_part_test: no valid alternate " "partition is available"); return ESLURM_PARTITION_NOT_AVAIL; } if (alt_part_ptr->state_up & PARTITION_SUBMIT) break; /* Try next alternate in the sequence */ alt_name = alt_part_ptr->alternate; } if (alt_name == NULL) { info("_alt_part_test: no valid alternate partition is " "available"); return ESLURM_PARTITION_NOT_AVAIL; } *part_ptr_new = alt_part_ptr; } return SLURM_SUCCESS; } /* * Test if this job can use this partition * * NOTE: This function is also called with a dummy job_desc_msg_t from * job_limits_check() if there is any new check added here you may also have to * add that parameter to the job_desc_msg_t in that function. */ static int _part_access_check(part_record_t *part_ptr, job_desc_msg_t *job_desc, bitstr_t *req_bitmap, uid_t submit_uid, slurmdb_qos_rec_t *qos_ptr, char *acct) { uint32_t total_nodes, min_nodes_tmp, max_nodes_tmp; uint32_t job_min_nodes, job_max_nodes; int rc = SLURM_SUCCESS; if ((part_ptr->flags & PART_FLAG_REQ_RESV) && (!job_desc->reservation || job_desc->reservation[0] == '\0')) { debug2("%s: uid %u access to partition %s " "denied, requires reservation", __func__, (unsigned int) submit_uid, part_ptr->name); return ESLURM_ACCESS_DENIED; } if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) && (submit_uid != 0) && (submit_uid != slurmctld_conf.slurm_user_id)) { debug2("%s: uid %u access to partition %s " "denied, not root", __func__, (unsigned int) submit_uid, part_ptr->name); return ESLURM_ACCESS_DENIED; } if ((job_desc->user_id == 0) && (part_ptr->flags & PART_FLAG_NO_ROOT)) { error("%s: Security violation, SUBMIT_JOB for " "user root disabled", __func__); return ESLURM_USER_ID_MISSING; } if (validate_group(part_ptr, job_desc->user_id) == 0) { debug2("%s: uid %u access to partition %s " "denied, bad group", __func__, (unsigned int) job_desc->user_id, part_ptr->name); return ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP; } if (validate_alloc_node(part_ptr, job_desc->alloc_node) == 0) { debug2("%s: uid %u access to partition %s " "denied, bad allocating node: %s", __func__, (unsigned int) job_desc->user_id, part_ptr->name, job_desc->alloc_node); return ESLURM_ACCESS_DENIED; } if ((part_ptr->state_up & PARTITION_SCHED) && (job_desc->min_cpus != NO_VAL)) { if (job_desc->min_cpus > part_ptr->total_cpus) { debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)", __func__, job_desc->min_cpus, part_ptr->name, part_ptr->total_cpus); return ESLURM_TOO_MANY_REQUESTED_CPUS; } else if (job_desc->min_cpus > (part_ptr->max_cpus_per_node * part_ptr->total_nodes)) { debug2("%s: Job requested too many CPUs (%u) of partition %s(%u)", __func__, job_desc->min_cpus, part_ptr->name, (part_ptr->max_cpus_per_node * part_ptr->total_nodes)); return ESLURM_TOO_MANY_REQUESTED_CPUS; } } /* Check against total nodes on the partition */ total_nodes = part_ptr->total_nodes; if ((part_ptr->state_up & PARTITION_SCHED) && (job_desc->min_nodes != NO_VAL) && (job_desc->min_nodes > total_nodes)) { debug2("%s: Job requested too many nodes (%u) " "of partition %s(%u)", __func__, job_desc->min_nodes, part_ptr->name, total_nodes); return ESLURM_INVALID_NODE_COUNT; } if (req_bitmap && !bit_super_set(req_bitmap, part_ptr->node_bitmap)) { debug2("%s: requested nodes %s not in partition %s", __func__, job_desc->req_nodes, part_ptr->name); return ESLURM_REQUESTED_NODES_NOT_IN_PARTITION; } /* The node counts have not been altered yet, so do not figure them out * by using the cpu counts. The partitions have already been altered * so we have to use the original values. */ job_min_nodes = job_desc->min_nodes; job_max_nodes = job_desc->max_nodes; min_nodes_tmp = part_ptr->min_nodes; max_nodes_tmp = part_ptr->max_nodes; /* Check against min/max node limits in the partition */ if ((part_ptr->state_up & PARTITION_SCHED) && (job_min_nodes != NO_VAL) && (job_min_nodes < min_nodes_tmp) && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_MIN_NODE)))) { debug2("%s: Job requested for nodes (%u) " "smaller than partition %s(%u) min nodes", __func__, job_min_nodes, part_ptr->name, min_nodes_tmp); return ESLURM_INVALID_NODE_COUNT; } if ((part_ptr->state_up & PARTITION_SCHED) && (job_max_nodes != NO_VAL) && (job_max_nodes > max_nodes_tmp) && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_MAX_NODE)))) { debug2("%s: Job requested for nodes (%u) greater than partition" " %s(%u) max nodes", __func__, job_max_nodes, part_ptr->name, max_nodes_tmp); return ESLURM_INVALID_NODE_COUNT; } if ((part_ptr->state_up & PARTITION_SCHED) && (job_desc->time_limit != NO_VAL) && (job_desc->time_limit > part_ptr->max_time) && (!qos_ptr || !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT))) { debug2("%s: Job time limit (%u) exceeds limit of partition " "%s(%u)", __func__, job_desc->time_limit, part_ptr->name, part_ptr->max_time); return ESLURM_INVALID_TIME_LIMIT; } if (slurmctld_conf.enforce_part_limits) { if ((rc = part_policy_valid_acct(part_ptr, acct, NULL)) != SLURM_SUCCESS) goto fini; if ((rc = part_policy_valid_qos(part_ptr, qos_ptr, NULL)) != SLURM_SUCCESS) goto fini; } fini: return rc; } static int _get_job_parts(job_desc_msg_t *job_desc, part_record_t **part_pptr, List *part_pptr_list, char **err_msg) { part_record_t *part_ptr = NULL, *part_ptr_new = NULL; List part_ptr_list = NULL; int rc = SLURM_SUCCESS; /* Identify partition(s) and set pointer(s) to their struct */ if (job_desc->partition) { char *err_part = NULL; part_ptr = find_part_record(job_desc->partition); if (part_ptr == NULL) { part_ptr_list = get_part_list(job_desc->partition, &err_part); if (part_ptr_list) { part_ptr = list_peek(part_ptr_list); if (list_count(part_ptr_list) == 1) FREE_NULL_LIST(part_ptr_list); } } if (part_ptr == NULL) { info("%s: invalid partition specified: %s", __func__, job_desc->partition); if (err_msg) { xfree(*err_msg); xstrfmtcat(*err_msg, "invalid partition specified: %s", err_part); xfree(err_part); } return ESLURM_INVALID_PARTITION_NAME; } } else if (job_desc->reservation && job_desc->reservation[0] != '\0' ) { slurmctld_resv_t *resv_ptr = NULL; resv_ptr = find_resv_name(job_desc->reservation); if (resv_ptr) part_ptr = resv_ptr->part_ptr; if (part_ptr) job_desc->partition = xstrdup(part_ptr->name); } if (!part_ptr) { if (default_part_loc == NULL) { error("%s: default partition not set", __func__); return ESLURM_DEFAULT_PARTITION_NOT_SET; } part_ptr = default_part_loc; job_desc->partition = xstrdup(part_ptr->name); } /* Change partition pointer(s) to alternates as needed */ if (part_ptr_list) { int fail_rc = SLURM_SUCCESS; part_record_t *part_ptr_tmp; bool rebuild_name_list = false; ListIterator iter = list_iterator_create(part_ptr_list); while ((part_ptr_tmp = list_next(iter))) { rc = _alt_part_test(part_ptr_tmp, &part_ptr_new); if (rc != SLURM_SUCCESS) { fail_rc = rc; list_remove(iter); rebuild_name_list = true; continue; } if (part_ptr_new) { list_insert(iter, part_ptr_new); list_remove(iter); rebuild_name_list = true; } } list_iterator_destroy(iter); if (list_is_empty(part_ptr_list)) { if (fail_rc != SLURM_SUCCESS) rc = fail_rc; else rc = ESLURM_PARTITION_NOT_AVAIL; goto fini; } rc = SLURM_SUCCESS; /* At least some partition usable */ if (rebuild_name_list) { part_ptr = NULL; xfree(job_desc->partition); iter = list_iterator_create(part_ptr_list); while ((part_ptr_tmp = list_next(iter))) { if (job_desc->partition) xstrcat(job_desc->partition, ","); else part_ptr = part_ptr_tmp; xstrcat(job_desc->partition, part_ptr_tmp->name); } list_iterator_destroy(iter); if (!part_ptr) { rc = ESLURM_PARTITION_NOT_AVAIL; goto fini; } } } else { rc = _alt_part_test(part_ptr, &part_ptr_new); if (rc != SLURM_SUCCESS) goto fini; if (part_ptr_new) { part_ptr = part_ptr_new; xfree(job_desc->partition); job_desc->partition = xstrdup(part_ptr->name); } } *part_pptr = part_ptr; if (part_pptr_list) { *part_pptr_list = part_ptr_list; part_ptr_list = NULL; } else FREE_NULL_LIST(part_ptr_list); fini: return rc; } static int _valid_job_part(job_desc_msg_t *job_desc, uid_t submit_uid, bitstr_t *req_bitmap, part_record_t *part_ptr, List part_ptr_list, slurmdb_assoc_rec_t *assoc_ptr, slurmdb_qos_rec_t *qos_ptr) { int rc = SLURM_SUCCESS; part_record_t *part_ptr_tmp; slurmdb_assoc_rec_t assoc_rec; uint32_t min_nodes_orig = INFINITE, max_nodes_orig = 1; uint32_t max_time = 0; bool any_check = false; /* Change partition pointer(s) to alternates as needed */ if (part_ptr_list) { int fail_rc = SLURM_SUCCESS; ListIterator iter = list_iterator_create(part_ptr_list); while ((part_ptr_tmp = list_next(iter))) { /* * FIXME: When dealing with multiple partitions we * currently can't deal with partition based * associations. */ memset(&assoc_rec, 0, sizeof(assoc_rec)); if (assoc_ptr) { assoc_rec.acct = assoc_ptr->acct; assoc_rec.partition = part_ptr_tmp->name; assoc_rec.uid = job_desc->user_id; (void) assoc_mgr_fill_in_assoc( acct_db_conn, &assoc_rec, accounting_enforce, NULL, false); } if (assoc_ptr && assoc_rec.id != assoc_ptr->id) { info("%s: can't check multiple " "partitions with partition based " "associations", __func__); rc = SLURM_ERROR; } else { rc = _part_access_check(part_ptr_tmp, job_desc, req_bitmap, submit_uid, qos_ptr, assoc_ptr ? assoc_ptr->acct : NULL); } if ((rc != SLURM_SUCCESS) && ((rc == ESLURM_ACCESS_DENIED) || (rc == ESLURM_USER_ID_MISSING) || (rc == ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP) || (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ALL))) { fail_rc = rc; break; } else if (rc != SLURM_SUCCESS) { fail_rc = rc; } else { any_check = true; } /* Set to success since we found a usable partition */ if (any_check && slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ANY) fail_rc = SLURM_SUCCESS; min_nodes_orig = MIN(min_nodes_orig, part_ptr_tmp->min_nodes_orig); max_nodes_orig = MAX(max_nodes_orig, part_ptr_tmp->max_nodes_orig); max_time = MAX(max_time, part_ptr_tmp->max_time); } list_iterator_destroy(iter); if (list_is_empty(part_ptr_list) || (slurmctld_conf.enforce_part_limits && (fail_rc != SLURM_SUCCESS))) { if (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ALL) rc = fail_rc; else if (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ANY && !any_check) rc = fail_rc; else { rc = ESLURM_PARTITION_NOT_AVAIL; } goto fini; } rc = SLURM_SUCCESS; /* At least some partition usable */ } else { min_nodes_orig = part_ptr->min_nodes_orig; max_nodes_orig = part_ptr->max_nodes_orig; max_time = part_ptr->max_time; rc = _part_access_check(part_ptr, job_desc, req_bitmap, submit_uid, qos_ptr, assoc_ptr ? assoc_ptr->acct : NULL); if ((rc != SLURM_SUCCESS) && ((rc == ESLURM_ACCESS_DENIED) || (rc == ESLURM_USER_ID_MISSING) || (rc == ESLURM_JOB_MISSING_REQUIRED_PARTITION_GROUP) || slurmctld_conf.enforce_part_limits)) goto fini; /* Enforce Part Limit = no */ rc = SLURM_SUCCESS; } /* Validate job limits against partition limits */ /* Check Partition with the highest limits when there are muliple */ if (job_desc->min_nodes == NO_VAL) { /* Avoid setting the job request to 0 nodes unless requested */ if (!min_nodes_orig) job_desc->min_nodes = 1; else job_desc->min_nodes = min_nodes_orig; } else if ((job_desc->min_nodes > max_nodes_orig) && slurmctld_conf.enforce_part_limits && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_MAX_NODE)))) { info("%s: job's min nodes greater than " "partition's max nodes (%u > %u)", __func__, job_desc->min_nodes, max_nodes_orig); rc = ESLURM_INVALID_NODE_COUNT; goto fini; } else if ((job_desc->min_nodes < min_nodes_orig) && ((job_desc->max_nodes == NO_VAL) || (job_desc->max_nodes >= min_nodes_orig))) { job_desc->min_nodes = min_nodes_orig; } if ((job_desc->max_nodes != NO_VAL) && slurmctld_conf.enforce_part_limits && (job_desc->max_nodes < min_nodes_orig) && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_MIN_NODE)))) { info("%s: job's max nodes less than partition's " "min nodes (%u < %u)", __func__, job_desc->max_nodes, min_nodes_orig); rc = ESLURM_INVALID_NODE_COUNT; goto fini; } #ifndef HAVE_FRONT_END /* Zero node count OK for persistent burst buffer create or destroy */ if ((job_desc->min_nodes == 0) && (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) || (!job_desc->burst_buffer && !job_desc->script))) { info("%s: min_nodes is zero", __func__); rc = ESLURM_INVALID_NODE_COUNT; goto fini; } #endif if ((job_desc->time_limit == NO_VAL) && (part_ptr->default_time == 0)) { info("%s: job's default time is 0", __func__); rc = ESLURM_INVALID_TIME_LIMIT; goto fini; } if ((job_desc->time_limit == NO_VAL) && (part_ptr->default_time != NO_VAL)) job_desc->time_limit = part_ptr->default_time; if ((job_desc->time_min != NO_VAL) && (job_desc->time_min > max_time) && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT)))) { info("%s: job's min time greater than " "partition's (%u > %u)", __func__, job_desc->time_min, max_time); rc = ESLURM_INVALID_TIME_MIN_LIMIT; goto fini; } if ((job_desc->time_limit != NO_VAL) && (job_desc->time_limit > max_time) && (job_desc->time_min == NO_VAL) && slurmctld_conf.enforce_part_limits && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT)))) { info("%s: job's time limit greater than " "partition's (%u > %u)", __func__, job_desc->time_limit, max_time); rc = ESLURM_INVALID_TIME_LIMIT; goto fini; } if ((job_desc->time_min != NO_VAL) && (job_desc->time_min > job_desc->time_limit) && (!qos_ptr || (qos_ptr && !(qos_ptr->flags & QOS_FLAG_PART_TIME_LIMIT)))) { info("%s: job's min_time greater time limit " "(%u > %u)", __func__, job_desc->time_min, job_desc->time_limit); rc = ESLURM_INVALID_TIME_MIN_LIMIT; goto fini; } if ((job_desc->deadline) && (job_desc->deadline != NO_VAL)) { char time_str_now[32]; char time_str_deadline[32]; time_t now = time(NULL); slurm_make_time_str(&job_desc->deadline, time_str_deadline, sizeof(time_str_deadline)); slurm_make_time_str(&now, time_str_now, sizeof(time_str_now)); if (job_desc->deadline < now) { info("%s: job's deadline smaller than now (%s < %s)", __func__, time_str_deadline, time_str_now); rc = ESLURM_INVALID_TIME_LIMIT; goto fini; } if ((job_desc->time_min) && (job_desc->time_min != NO_VAL) && (job_desc->deadline < (now + job_desc->time_min * 60))) { info("%s: job's min_time greater than deadline (%u > %s)", __func__, job_desc->time_min, time_str_deadline); rc = ESLURM_INVALID_TIME_MIN_LIMIT; goto fini; } if ((job_desc->time_min == 0) && (job_desc->time_limit) && (job_desc->time_limit != NO_VAL) && (job_desc->deadline < (now + job_desc->time_limit * 60))) { info("%s: job's time_limit greater than deadline (%u > %s)", __func__, job_desc->time_limit, time_str_deadline); rc = ESLURM_INVALID_TIME_LIMIT; goto fini; } } fini: return rc; } /* * job_limits_check - check the limits specified for the job. * IN job_ptr - pointer to job table entry. * IN check_min_time - if true test job's minimum time limit, * otherwise test maximum time limit * RET WAIT_NO_REASON on success, fail status otherwise. */ extern int job_limits_check(job_record_t **job_pptr, bool check_min_time) { struct job_details *detail_ptr; enum job_state_reason fail_reason; part_record_t *part_ptr = NULL; job_record_t *job_ptr = NULL; slurmdb_qos_rec_t *qos_ptr; slurmdb_assoc_rec_t *assoc_ptr; job_desc_msg_t job_desc; int rc; job_ptr = *job_pptr; detail_ptr = job_ptr->details; part_ptr = job_ptr->part_ptr; qos_ptr = job_ptr->qos_ptr; assoc_ptr = job_ptr->assoc_ptr; if (!detail_ptr || !part_ptr) { fatal_abort("%pJ has NULL details_ptr and/or part_ptr", job_ptr); return WAIT_NO_REASON; /* To prevent CLANG error */ } fail_reason = WAIT_NO_REASON; /* * Here we need to pretend we are just submitting the job so we can * utilize the already existing function _part_access_check. If any * additional fields in that function are ever checked, the fields set * below will need to be modified. */ slurm_init_job_desc_msg(&job_desc); job_desc.reservation = job_ptr->resv_name; job_desc.user_id = job_ptr->user_id; job_desc.alloc_node = job_ptr->alloc_node; job_desc.min_cpus = detail_ptr->orig_min_cpus; job_desc.min_nodes = detail_ptr->min_nodes; /* _part_access_check looks for NO_VAL instead of 0 */ job_desc.max_nodes = detail_ptr->max_nodes ? detail_ptr->max_nodes : NO_VAL;; if (check_min_time && job_ptr->time_min) job_desc.time_limit = job_ptr->time_min; else job_desc.time_limit = job_ptr->time_limit; if ((rc = _part_access_check(part_ptr, &job_desc, NULL, job_ptr->user_id, qos_ptr, job_ptr->account))) { debug2("%pJ can't run in partition %s: %s", job_ptr, part_ptr->name, slurm_strerror(rc)); switch (rc) { case ESLURM_INVALID_TIME_LIMIT: case ESLURM_INVALID_TIME_MIN_LIMIT: if (job_ptr->limit_set.time != ADMIN_SET_LIMIT) fail_reason = WAIT_PART_TIME_LIMIT; break; case ESLURM_INVALID_NODE_COUNT: fail_reason = WAIT_PART_NODE_LIMIT; break; /* FIXME */ /* case ESLURM_TOO_MANY_REQUESTED_CPUS: */ /* failt_reason = NON_EXISTANT_WAIT_PART_CPU_LIMIT; */ /* break; */ default: fail_reason = WAIT_PART_CONFIG; break; } } else if (part_ptr->state_up == PARTITION_DOWN) { debug2("%pJ requested down partition %s", job_ptr, part_ptr->name); fail_reason = WAIT_PART_DOWN; } else if (part_ptr->state_up == PARTITION_INACTIVE) { debug2("%pJ requested inactive partition %s", job_ptr, part_ptr->name); fail_reason = WAIT_PART_INACTIVE; } else if (qos_ptr && assoc_ptr && (qos_ptr->flags & QOS_FLAG_ENFORCE_USAGE_THRES) && (!fuzzy_equal(qos_ptr->usage_thres, NO_VAL))) { if (!job_ptr->prio_factors) { job_ptr->prio_factors = xmalloc(sizeof(priority_factors_object_t)); } if (!job_ptr->prio_factors->priority_fs) { if (fuzzy_equal(assoc_ptr->usage->usage_efctv, NO_VAL)) priority_g_set_assoc_usage(assoc_ptr); job_ptr->prio_factors->priority_fs = priority_g_calc_fs_factor( assoc_ptr->usage->usage_efctv, (long double)assoc_ptr->usage-> shares_norm); } if (job_ptr->prio_factors->priority_fs < qos_ptr->usage_thres){ debug2("%pJ exceeds usage threshold", job_ptr); fail_reason = WAIT_QOS_THRES; } } else if (fail_reason == WAIT_NO_REASON) { /* * Here we need to pretend we are just submitting the job so we * can utilize the already existing function _valid_pn_min_mem. * If anything else is ever checked in that function this will * most likely have to be updated. Some of the needed members * were already initialized above to call _part_access_check, as * well as the memset for job_desc. */ if (job_ptr->bit_flags & JOB_MEM_SET) job_desc.pn_min_memory = detail_ptr->orig_pn_min_memory; else if (part_ptr->def_mem_per_cpu) job_desc.pn_min_memory = part_ptr->def_mem_per_cpu; else job_desc.pn_min_memory = slurmctld_conf.def_mem_per_cpu; if (detail_ptr->orig_cpus_per_task == NO_VAL16) job_desc.cpus_per_task = 1; else job_desc.cpus_per_task = detail_ptr->orig_cpus_per_task; if (detail_ptr->num_tasks) job_desc.num_tasks = detail_ptr->num_tasks; else { job_desc.num_tasks = job_desc.min_nodes; if (detail_ptr->ntasks_per_node != NO_VAL16) job_desc.num_tasks *= detail_ptr->ntasks_per_node; } //job_desc.min_cpus = detail_ptr->min_cpus; /* init'ed above */ job_desc.max_cpus = detail_ptr->orig_max_cpus; job_desc.shared = (uint16_t)detail_ptr->share_res; job_desc.ntasks_per_node = detail_ptr->ntasks_per_node; job_desc.pn_min_cpus = detail_ptr->orig_pn_min_cpus; job_desc.job_id = job_ptr->job_id; if (!_valid_pn_min_mem(&job_desc, part_ptr)) { /* debug2 message already logged inside the function. */ fail_reason = WAIT_PN_MEM_LIMIT; } else { /* Copy back to job_record adjusted members */ detail_ptr->pn_min_memory = job_desc.pn_min_memory; detail_ptr->cpus_per_task = job_desc.cpus_per_task; detail_ptr->min_cpus = job_desc.min_cpus; detail_ptr->max_cpus = job_desc.max_cpus; detail_ptr->pn_min_cpus = job_desc.pn_min_cpus; } } return (fail_reason); } /* * _job_create - create a job table record for the supplied specifications. * This performs only basic tests for request validity (access to * partition, nodes count in partition, and sufficient processors in * partition). * IN job_specs - job specifications * IN allocate - resource allocation request if set rather than job submit * IN will_run - job is not to be created, test of validity only * OUT job_pptr - pointer to the job (NULL on error) * OUT err_msg - Error message for user * RET 0 on success, otherwise ESLURM error code. If the job would only be * able to execute with some change in partition configuration then * ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE is returned */ static int _job_create(job_desc_msg_t *job_desc, int allocate, int will_run, job_record_t **job_pptr, uid_t submit_uid, char **err_msg, uint16_t protocol_version) { int error_code = SLURM_SUCCESS, i, qos_error; part_record_t *part_ptr = NULL; List part_ptr_list = NULL; bitstr_t *req_bitmap = NULL, *exc_bitmap = NULL; job_record_t *job_ptr = NULL; slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL; List license_list = NULL, gres_list = NULL; bool valid; slurmdb_qos_rec_t qos_rec, *qos_ptr; uint32_t user_submit_priority, acct_reason = 0; acct_policy_limit_set_t acct_policy_limit_set; memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set)); acct_policy_limit_set.tres = xcalloc(slurmctld_tres_cnt, sizeof(uint16_t)); *job_pptr = NULL; user_submit_priority = job_desc->priority; /* * Reject X11 forwarding requests from 18.08 clients since the * implementation has changed, and support for setting up tunnels in * the older style was removed with no backwards compatibility. * Remove this two versions after 19.05 is released. */ if (job_desc->x11 && (protocol_version < SLURM_19_05_PROTOCOL_VERSION)) { info("%s: cannot support X11 tunnelling from older salloc/srun", __func__); error_code = ESLURM_X11_NOT_AVAIL; goto cleanup_fail; } /* ensure that selected nodes are in this partition */ if (job_desc->req_nodes) { error_code = node_name2bitmap(job_desc->req_nodes, false, &req_bitmap); if (error_code) { error_code = ESLURM_INVALID_NODE_NAME; goto cleanup_fail; } if ((job_desc->contiguous != NO_VAL16) && (job_desc->contiguous)) bit_fill_gaps(req_bitmap); i = bit_set_count(req_bitmap); if (i > job_desc->min_nodes) job_desc->min_nodes = i; if (i > job_desc->min_cpus) job_desc->min_cpus = i; if (job_desc->max_nodes && (job_desc->min_nodes > job_desc->max_nodes)) { #if 0 info("%s: max node count less than required hostlist " "size for user %u", __func__, job_desc->user_id); job_desc->max_nodes = job_desc->min_nodes; #else error_code = ESLURM_INVALID_NODE_COUNT; goto cleanup_fail; #endif } } /* Zero node count OK for persistent burst buffer create or destroy */ if ((job_desc->max_nodes == 0) && (job_desc->array_inx || (job_desc->het_job_offset != NO_VAL) || (!job_desc->burst_buffer && !job_desc->script))) { info("%s: max_nodes is zero", __func__); error_code = ESLURM_INVALID_NODE_COUNT; goto cleanup_fail; } error_code = _get_job_parts(job_desc, &part_ptr, &part_ptr_list, err_msg); if (error_code != SLURM_SUCCESS) goto cleanup_fail; memset(&assoc_rec, 0, sizeof(assoc_rec)); assoc_rec.acct = job_desc->account; assoc_rec.partition = part_ptr->name; assoc_rec.uid = job_desc->user_id; /* * Checks are done later to validate assoc_ptr, so we don't * need to lock outside of fill_in_assoc. */ if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, &assoc_ptr, false)) { info("%s: invalid account or partition for user %u, " "account '%s', and partition '%s'", __func__, job_desc->user_id, assoc_rec.acct, assoc_rec.partition); error_code = ESLURM_INVALID_ACCOUNT; goto cleanup_fail; } else if (association_based_accounting && !assoc_ptr && !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS)) { /* * If not enforcing associations we want to look for the * default account and use it to avoid getting trash in the * accounting records. */ assoc_rec.acct = NULL; (void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, &assoc_ptr, false); if (assoc_ptr) { info("%s: account '%s' has no association for user %u " "using default account '%s'", __func__, job_desc->account, job_desc->user_id, assoc_rec.acct); xfree(job_desc->account); } } if (job_desc->account == NULL) job_desc->account = xstrdup(assoc_rec.acct); /* This must be done after we have the assoc_ptr set */ memset(&qos_rec, 0, sizeof(qos_rec)); qos_rec.name = job_desc->qos; qos_ptr = _determine_and_validate_qos( job_desc->reservation, assoc_ptr, false, &qos_rec, &qos_error, false, LOG_LEVEL_ERROR); if (qos_error != SLURM_SUCCESS) { error_code = qos_error; goto cleanup_fail; } error_code = _valid_job_part(job_desc, submit_uid, req_bitmap, part_ptr, part_ptr_list, assoc_ptr, qos_ptr); if (error_code != SLURM_SUCCESS) goto cleanup_fail; if ((error_code = _validate_job_desc(job_desc, allocate, submit_uid, part_ptr, part_ptr_list))) { goto cleanup_fail; } job_desc->tres_req_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t)); job_desc->tres_req_cnt[TRES_ARRAY_NODE] = job_desc->min_nodes; job_desc->tres_req_cnt[TRES_ARRAY_CPU] = job_desc->min_cpus; job_desc->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(NULL, job_desc->pn_min_memory, job_desc->tres_req_cnt[TRES_ARRAY_CPU], job_desc->min_nodes); license_list = license_validate(job_desc->licenses, validate_cfgd_licenses, true, job_desc->tres_req_cnt, &valid); if (!valid) { info("Job's requested licenses are invalid: %s", job_desc->licenses); error_code = ESLURM_INVALID_LICENSES; goto cleanup_fail; } if ((error_code = gres_plugin_job_state_validate( job_desc->cpus_per_tres, job_desc->tres_freq, job_desc->tres_per_job, job_desc->tres_per_node, job_desc->tres_per_socket, job_desc->tres_per_task, job_desc->mem_per_tres, &job_desc->num_tasks, &job_desc->min_nodes, &job_desc->max_nodes, &job_desc->ntasks_per_node, &job_desc->ntasks_per_socket, &job_desc->sockets_per_node, &job_desc->cpus_per_task, &gres_list))) goto cleanup_fail; if (!valid_tres_cnt(job_desc->cpus_per_tres) || !valid_tres_cnt(job_desc->mem_per_tres) || tres_bind_verify_cmdline(job_desc->tres_bind) || tres_freq_verify_cmdline(job_desc->tres_freq) || !valid_tres_cnt(job_desc->mem_per_tres) || !valid_tres_cnt(job_desc->tres_per_job) || !valid_tres_cnt(job_desc->tres_per_node) || !valid_tres_cnt(job_desc->tres_per_socket) || !valid_tres_cnt(job_desc->tres_per_task)) { error_code = ESLURM_INVALID_TRES; goto cleanup_fail; } gres_set_job_tres_cnt(gres_list, job_desc->min_nodes, job_desc->tres_req_cnt, false); /* * Do this last,after other TRES' have been set as it uses the other * values to calculate the billing value. */ job_desc->tres_req_cnt[TRES_ARRAY_BILLING] = assoc_mgr_tres_weighted(job_desc->tres_req_cnt, part_ptr->billing_weights, slurmctld_conf.priority_flags, false); if ((error_code = bb_g_job_validate(job_desc, submit_uid)) != SLURM_SUCCESS) goto cleanup_fail; if (job_desc->deadline && (job_desc->time_limit == NO_VAL) && (job_desc->time_min == NO_VAL)) job_desc->time_min = 1; if ((accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) && (!acct_policy_validate(job_desc, part_ptr, assoc_ptr, qos_ptr, &acct_reason, &acct_policy_limit_set, 0))) { if (err_msg) { xfree(*err_msg); *err_msg = xstrdup(job_reason_string(acct_reason)); } info("%s: exceeded association/QOS limit for user %u: %s", __func__, job_desc->user_id, err_msg ? *err_msg : job_reason_string(acct_reason)); error_code = ESLURM_ACCOUNTING_POLICY; goto cleanup_fail; } if (job_desc->exc_nodes) { error_code = node_name2bitmap(job_desc->exc_nodes, false, &exc_bitmap); if (error_code) { error_code = ESLURM_INVALID_NODE_NAME; goto cleanup_fail; } } if (exc_bitmap && req_bitmap) { bitstr_t *tmp_bitmap = NULL; bitoff_t first_set; tmp_bitmap = bit_copy(exc_bitmap); bit_and(tmp_bitmap, req_bitmap); first_set = bit_ffs(tmp_bitmap); FREE_NULL_BITMAP(tmp_bitmap); if (first_set != -1) { info("Job's required and excluded node lists overlap"); error_code = ESLURM_INVALID_NODE_NAME; goto cleanup_fail; } } if (job_desc->min_nodes == NO_VAL) job_desc->min_nodes = 1; if (job_desc->max_nodes == NO_VAL) job_desc->max_nodes = 0; if (job_desc->max_nodes && (job_desc->max_nodes < job_desc->min_nodes)) { info("%s: Job's max_nodes(%u) < min_nodes(%u)", __func__, job_desc->max_nodes, job_desc->min_nodes); error_code = ESLURM_INVALID_NODE_COUNT; goto cleanup_fail; } if ((error_code = _copy_job_desc_to_job_record(job_desc, job_pptr, &req_bitmap, &exc_bitmap))) { if (error_code == SLURM_ERROR) error_code = ESLURM_ERROR_ON_DESC_TO_RECORD_COPY; job_ptr = *job_pptr; goto cleanup_fail; } job_ptr = *job_pptr; job_ptr->start_protocol_ver = protocol_version; job_ptr->part_ptr = part_ptr; job_ptr->part_ptr_list = part_ptr_list; job_ptr->bit_flags |= JOB_DEPENDENT; job_ptr->last_sched_eval = time(NULL); part_ptr_list = NULL; memcpy(&job_ptr->limit_set, &acct_policy_limit_set, sizeof(acct_policy_limit_set_t)); acct_policy_limit_set.tres = NULL; job_ptr->assoc_id = assoc_rec.id; job_ptr->assoc_ptr = (void *) assoc_ptr; job_ptr->qos_ptr = (void *) qos_ptr; job_ptr->qos_id = qos_rec.id; if (mcs_g_set_mcs_label(job_ptr, job_desc->mcs_label) != 0 ) { if (job_desc->mcs_label == NULL) { error("Failed to create job: No valid mcs_label found"); } else { error("Failed to create job: Invalid mcs-label: %s", job_desc->mcs_label); } error_code = ESLURM_INVALID_MCS_LABEL; goto cleanup_fail; } /* * Permission for altering priority was confirmed above. The job_submit * plugin may have set the priority directly or put the job on hold. If * the priority is not given, we will figure it out later after we see * if the job is eligible or not. So we want NO_VAL if not set. */ job_ptr->priority = job_desc->priority; if (job_ptr->priority == 0) { if (user_submit_priority == 0) job_ptr->state_reason = WAIT_HELD_USER; else job_ptr->state_reason = WAIT_HELD; } else if (job_ptr->priority != NO_VAL) { job_ptr->direct_set_prio = 1; } /* * The job submit plugin sets site_factor to NO_VAL so that it can * only be set the by the job submit plugin at submission. */ if (job_desc->site_factor != NO_VAL) job_ptr->site_factor = job_desc->site_factor; error_code = update_job_dependency(job_ptr, job_desc->dependency); if (error_code != SLURM_SUCCESS) goto cleanup_fail; job_ptr->details->orig_dependency = xstrdup(job_ptr->details-> dependency); if ((error_code = build_feature_list(job_ptr))) goto cleanup_fail; /* * NOTE: If this job is being used to expand another job, this job's * gres_list has already been filled in with a copy of gres_list job * to be expanded by update_job_dependency() */ if (!job_ptr->details->expanding_jobid) { job_ptr->gres_list = gres_list; gres_list = NULL; } job_ptr->gres_detail_cnt = 0; job_ptr->gres_detail_str = NULL; gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id); if ((error_code = validate_job_resv(job_ptr))) goto cleanup_fail; if (job_desc->script && (!will_run)) { /* don't bother with copy if just a test */ if ((error_code = _copy_job_desc_to_file(job_desc, job_ptr->job_id))) { error_code = ESLURM_WRITING_TO_FILE; goto cleanup_fail; } job_ptr->batch_flag = 1; } else job_ptr->batch_flag = 0; if (!will_run && (error_code = bb_g_job_validate2(job_ptr, err_msg))) goto cleanup_fail; job_ptr->license_list = license_list; license_list = NULL; if (job_desc->req_switch != NO_VAL) { /* Max # of switches */ job_ptr->req_switch = job_desc->req_switch; if (job_desc->wait4switch != NO_VAL) { job_ptr->wait4switch = _max_switch_wait(job_desc->wait4switch); } else job_ptr->wait4switch = _max_switch_wait(INFINITE); } job_ptr->best_switch = true; FREE_NULL_LIST(license_list); FREE_NULL_LIST(gres_list); FREE_NULL_BITMAP(req_bitmap); FREE_NULL_BITMAP(exc_bitmap); return error_code; cleanup_fail: if (job_ptr) { job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_SYSTEM; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = time(NULL); purge_job_record(job_ptr->job_id); *job_pptr = NULL; } FREE_NULL_LIST(license_list); xfree(acct_policy_limit_set.tres); FREE_NULL_LIST(gres_list); FREE_NULL_LIST(part_ptr_list); FREE_NULL_BITMAP(req_bitmap); FREE_NULL_BITMAP(exc_bitmap); return error_code; } static int _test_strlen(char *test_str, char *str_name, int max_str_len) { int i = 0; if (test_str) i = strlen(test_str); if (i > max_str_len) { info("job_create_request: strlen(%s) too big (%d > %d)", str_name, i, max_str_len); return ESLURM_PATHNAME_TOO_LONG; } return SLURM_SUCCESS; } /* For each token in a comma delimited job array expression set the matching * bitmap entry */ static bool _parse_array_tok(char *tok, bitstr_t *array_bitmap, uint32_t max) { char *end_ptr = NULL; int i, first, last, step = 1; if (tok[0] == '[') /* Strip leading "[" */ tok++; first = strtol(tok, &end_ptr, 10); if (end_ptr[0] == ']') /* Strip trailing "]" */ end_ptr++; if (first < 0) return false; if (end_ptr[0] == '-') { last = strtol(end_ptr + 1, &end_ptr, 10); if (end_ptr[0] == ']') /* Strip trailing "]" */ end_ptr++; if (end_ptr[0] == ':') { step = strtol(end_ptr + 1, &end_ptr, 10); if (end_ptr[0] == ']') /* Strip trailing "]" */ end_ptr++; if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) return false; if (step <= 0) return false; } else if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) { return false; } if (last < first) return false; } else if ((end_ptr[0] != '\0') && (end_ptr[0] != '%')) { return false; } else { last = first; } if (last >= max) return false; for (i = first; i <= last; i += step) { bit_set(array_bitmap, i); } return true; } /* Translate a job array expression into the equivalent bitmap */ static bool _valid_array_inx(job_desc_msg_t *job_desc) { static time_t sched_update = 0; static uint32_t max_task_cnt = NO_VAL; uint32_t task_cnt; bool valid = true; char *tmp, *tok, *last = NULL; FREE_NULL_BITMAP(job_desc->array_bitmap); if (!job_desc->array_inx || !job_desc->array_inx[0]) return true; if (!job_desc->script || !job_desc->script[0]) return false; if (max_array_size == NO_VAL) { max_array_size = slurmctld_conf.max_array_sz; } if (max_array_size == 0) { verbose("Job arrays disabled, MaxArraySize=0"); return false; } if (sched_update != slurmctld_conf.last_update) { char *sched_params = slurm_get_sched_params(); char *key; max_task_cnt = max_array_size; sched_update = slurmctld_conf.last_update; if ((key = xstrcasestr(sched_params, "max_array_tasks="))) { key += 16; max_task_cnt = atoi(key); } xfree(sched_params); } /* We have a job array request */ job_desc->immediate = 0; /* Disable immediate option */ job_desc->array_bitmap = bit_alloc(max_array_size); tmp = xstrdup(job_desc->array_inx); tok = strtok_r(tmp, ",", &last); while (tok && valid) { valid = _parse_array_tok(tok, job_desc->array_bitmap, max_array_size); tok = strtok_r(NULL, ",", &last); } xfree(tmp); if (valid && (max_task_cnt < max_array_size)) { task_cnt = bit_set_count(job_desc->array_bitmap); if (task_cnt > max_task_cnt) { debug("max_array_tasks exceeded (%u > %u)", task_cnt, max_task_cnt); valid = false; } } return valid; } /* Make sure a job descriptor's strings are not huge, which could result in * a denial of service attack due to memory demands by the slurmctld */ static int _test_job_desc_fields(job_desc_msg_t * job_desc) { static int max_script = -1; if (max_script == -1) { char *sched_params = slurm_get_sched_params(); char *tmp_ptr; max_script = 4 * 1024 * 1024; if ((tmp_ptr = xstrcasestr(sched_params, "max_script_size="))) { max_script = atoi(tmp_ptr + 16); } xfree(sched_params); } if (_test_strlen(job_desc->account, "account", 1024) || _test_strlen(job_desc->alloc_node, "alloc_node", 1024) || _test_strlen(job_desc->array_inx, "array_inx", 1024 * 4) || _test_strlen(job_desc->burst_buffer, "burst_buffer",1024*8) || _test_strlen(job_desc->comment, "comment", 1024) || _test_strlen(job_desc->cpu_bind, "cpu-bind", 1024 * 128) || _test_strlen(job_desc->cpus_per_tres, "cpus_per_tres", 1024)|| _test_strlen(job_desc->dependency, "dependency", 1024*128) || _test_strlen(job_desc->features, "features", 1024) || _test_strlen( job_desc->cluster_features, "cluster_features", 1024) || _test_strlen(job_desc->licenses, "licenses", 1024) || _test_strlen(job_desc->mail_user, "mail_user", 1024) || _test_strlen(job_desc->mcs_label, "mcs_label", 1024) || _test_strlen(job_desc->mem_bind, "mem-bind", 1024 * 128) || _test_strlen(job_desc->mem_per_tres, "mem_per_tres", 1024) || _test_strlen(job_desc->name, "name", 1024) || _test_strlen(job_desc->network, "network", 1024) || _test_strlen(job_desc->partition, "partition", 1024) || _test_strlen(job_desc->qos, "qos", 1024) || _test_strlen(job_desc->reservation, "reservation", 1024) || _test_strlen(job_desc->script, "script", max_script) || _test_strlen(job_desc->std_err, "std_err", MAXPATHLEN) || _test_strlen(job_desc->std_in, "std_in", MAXPATHLEN) || _test_strlen(job_desc->std_out, "std_out", MAXPATHLEN) || _test_strlen(job_desc->tres_bind, "tres_bind", 1024) || _test_strlen(job_desc->tres_freq, "tres_freq", 1024) || _test_strlen(job_desc->tres_per_job, "tres_per_job", 1024) || _test_strlen(job_desc->tres_per_node, "tres_per_node", 1024)|| _test_strlen(job_desc->tres_per_socket, "tres_per_socket", 1024) || _test_strlen(job_desc->tres_per_task, "tres_per_task", 1024)|| _test_strlen(job_desc->wckey, "wckey", 1024) || _test_strlen(job_desc->work_dir, "work_dir", MAXPATHLEN)) return ESLURM_PATHNAME_TOO_LONG; return SLURM_SUCCESS; } /* Perform some size checks on strings we store to prevent * malicious user filling slurmctld's memory * IN job_desc - user job submit request * IN submit_uid - UID making job submit request * OUT err_msg - custom error message to return * RET 0 or error code */ extern int validate_job_create_req(job_desc_msg_t * job_desc, uid_t submit_uid, char **err_msg) { int rc; /* * Check user permission for negative 'nice' and non-0 priority values * (restricted to root, SlurmUser, or SLURMDB_ADMIN_OPERATOR) _before_ * running the job_submit plugin. */ if (!validate_operator(submit_uid)) { if (job_desc->priority != 0) job_desc->priority = NO_VAL; if (job_desc->nice < NICE_OFFSET) return ESLURM_INVALID_NICE; } if (!validate_super_user(submit_uid)) { /* AdminComment can only be set by an Admin. */ if (job_desc->admin_comment) return ESLURM_ACCESS_DENIED; if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) { *err_msg = xstrdup("rebooting of nodes is only allowed for admins"); return ESLURM_ACCESS_DENIED; } } rc = job_submit_plugin_submit(job_desc, (uint32_t) submit_uid, err_msg); if (rc != SLURM_SUCCESS) return rc; rc = node_features_g_job_valid(job_desc->features); if (rc != SLURM_SUCCESS) return rc; rc = _test_job_desc_fields(job_desc); if (rc != SLURM_SUCCESS) return rc; if (!_valid_array_inx(job_desc)) return ESLURM_INVALID_ARRAY; if (job_desc->x11 && !(slurmctld_conf.prolog_flags & PROLOG_FLAG_X11)) return ESLURM_X11_NOT_AVAIL; /* Make sure anything that may be put in the database will be * lower case */ xstrtolower(job_desc->account); xstrtolower(job_desc->wckey); /* Basic validation of some parameters */ if (job_desc->req_nodes) { hostlist_t hl; uint32_t host_cnt; hl = hostlist_create(job_desc->req_nodes); if (hl == NULL) { /* likely a badly formatted hostlist */ error("validate_job_create_req: bad hostlist"); return ESLURM_INVALID_NODE_NAME; } host_cnt = hostlist_count(hl); hostlist_destroy(hl); if ((job_desc->min_nodes == NO_VAL) || (job_desc->min_nodes < host_cnt)) job_desc->min_nodes = host_cnt; } /* If max nodes is different than min nodes don't set tasks or * it will hard code the range. */ if ((job_desc->ntasks_per_node != NO_VAL16) && (job_desc->min_nodes != NO_VAL) && (job_desc->num_tasks == NO_VAL)) { job_desc->num_tasks = job_desc->ntasks_per_node * job_desc->min_nodes; } /* Only set min and max cpus if overcommit isn't set */ if ((job_desc->overcommit == NO_VAL8) && (job_desc->min_cpus != NO_VAL) && (job_desc->num_tasks != NO_VAL) && (job_desc->num_tasks > job_desc->min_cpus)) { if (job_desc->num_tasks != NO_VAL) job_desc->min_cpus = job_desc->num_tasks; else if (job_desc->min_nodes != NO_VAL) job_desc->min_cpus = job_desc->min_nodes; else job_desc->min_cpus = 1; if (job_desc->cpus_per_task != NO_VAL16) job_desc->min_cpus *= job_desc->cpus_per_task; /* This is just a sanity check as we wouldn't ever have a * max_cpus if we didn't have a min_cpus. */ if ((job_desc->max_cpus != NO_VAL) && (job_desc->max_cpus < job_desc->min_cpus)) job_desc->max_cpus = job_desc->min_cpus; } if (job_desc->reboot && (job_desc->reboot != NO_VAL16)) job_desc->shared = 0; return SLURM_SUCCESS; } /* _copy_job_desc_to_file - copy the job script and environment from the RPC * structure into a file */ static int _copy_job_desc_to_file(job_desc_msg_t * job_desc, uint32_t job_id) { int error_code = 0, hash; char *dir_name, *file_name; DEF_TIMERS; START_TIMER; if (!job_desc->environment || job_desc->env_size == 0) { error("%s: batch job cannot run without an environment", __func__); return ESLURM_ENVIRONMENT_MISSING; } /* Create directory based upon job ID due to limitations on the number * of files possible in a directory on some file system types (e.g. * up to 64k files on a FAT32 file system). */ hash = job_id % 10; dir_name = xstrdup_printf("%s/hash.%d", slurmctld_conf.state_save_location, hash); (void) mkdir(dir_name, 0700); /* Create job_id specific directory */ xstrfmtcat(dir_name, "/job.%u", job_id); if (mkdir(dir_name, 0700)) { if (!slurmctld_primary && (errno == EEXIST)) { error("Apparent duplicate JobId=%u. Two primary slurmctld daemons might currently be active", job_id); } error("mkdir(%s) error %m", dir_name); xfree(dir_name); return ESLURM_WRITING_TO_FILE; } /* Create environment file, and write data to it */ file_name = xstrdup_printf("%s/environment", dir_name); error_code = _write_data_array_to_file(file_name, job_desc->environment, job_desc->env_size); xfree(file_name); if (error_code == 0) { /* Create script file */ file_name = xstrdup_printf("%s/script", dir_name); error_code = _write_data_to_file(file_name, job_desc->script); xfree(file_name); } xfree(dir_name); END_TIMER2("_copy_job_desc_to_file"); return error_code; } /* Return true of the specified job ID already has a batch directory so * that a different job ID can be created. This is to help limit damage from * split-brain, where two slurmctld daemons are running as primary. */ static bool _dup_job_file_test(uint32_t job_id) { char *dir_name_src; struct stat buf; int rc, hash = job_id % 10; dir_name_src = xstrdup_printf("%s/hash.%d/job.%u", slurmctld_conf.state_save_location, hash, job_id); rc = stat(dir_name_src, &buf); xfree(dir_name_src); if (rc == 0) { error("Vestigial state files for JobId=%u, but no job record. This may be the result of two slurmctld running in primary mode", job_id); return true; } return false; } /* * Create file with specified name and write the supplied data array to it * IN file_name - file to create and write to * IN data - array of pointers to strings (e.g. env) * IN size - number of elements in data */ static int _write_data_array_to_file(char *file_name, char **data, uint32_t size) { int fd, i, pos, nwrite, amount; fd = creat(file_name, 0600); if (fd < 0) { error("Error creating file %s, %m", file_name); return ESLURM_WRITING_TO_FILE; } amount = write(fd, &size, sizeof(uint32_t)); if (amount < sizeof(uint32_t)) { error("Error writing file %s, %m", file_name); close(fd); return ESLURM_WRITING_TO_FILE; } if (data == NULL) { close(fd); return SLURM_SUCCESS; } for (i = 0; i < size; i++) { nwrite = strlen(data[i]) + 1; pos = 0; while (nwrite > 0) { amount = write(fd, &data[i][pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", file_name); close(fd); return ESLURM_WRITING_TO_FILE; } nwrite -= amount; pos += amount; } } close(fd); return SLURM_SUCCESS; } /* * Create file with specified name and write the supplied data array to it * IN file_name - file to create and write to * IN data - pointer to string */ static int _write_data_to_file(char *file_name, char *data) { int fd, pos, nwrite, amount; if (data == NULL) { (void) unlink(file_name); return SLURM_SUCCESS; } fd = creat(file_name, 0700); if (fd < 0) { error("Error creating file %s, %m", file_name); return ESLURM_WRITING_TO_FILE; } nwrite = strlen(data) + 1; pos = 0; while (nwrite > 0) { amount = write(fd, &data[pos], nwrite); if ((amount < 0) && (errno != EINTR)) { error("Error writing file %s, %m", file_name); close(fd); return ESLURM_WRITING_TO_FILE; } nwrite -= amount; pos += amount; } close(fd); return SLURM_SUCCESS; } /* * get_job_env - return the environment variables and their count for a * given job * IN job_ptr - pointer to job for which data is required * OUT env_size - number of elements to read * RET point to array of string pointers containing environment variables */ char **get_job_env(job_record_t *job_ptr, uint32_t *env_size) { char *file_name = NULL, **environment = NULL; int cc, fd = -1, hash; uint32_t use_id; use_id = (job_ptr->array_task_id != NO_VAL) ? job_ptr->array_job_id : job_ptr->job_id; hash = use_id % 10; file_name = xstrdup_printf("%s/hash.%d/job.%u/environment", slurmctld_conf.state_save_location, hash, use_id); fd = open(file_name, 0); if (fd >= 0) { cc = _read_data_array_from_file(fd, file_name, &environment, env_size, job_ptr); if (cc < 0) environment = NULL; close(fd); } else { error("Could not open environment file for %pJ", job_ptr); } xfree(file_name); return environment; } /* * get_job_script - return the script for a given job * IN job_ptr - pointer to job for which data is required * RET Buf containing job script */ Buf get_job_script(const job_record_t *job_ptr) { char *file_name = NULL; int hash; uint32_t use_id; Buf buf; if (!job_ptr->batch_flag) return NULL; use_id = (job_ptr->array_task_id != NO_VAL) ? job_ptr->array_job_id : job_ptr->job_id; hash = use_id % 10; file_name = xstrdup_printf("%s/hash.%d/job.%u/script", slurmctld_conf.state_save_location, hash, use_id); if (!(buf = create_mmap_buf(file_name))) error("Could not open script file for %pJ", job_ptr); xfree(file_name); return buf; } /* * Read a collection of strings from a file * IN fd - file descriptor * IN file_name - file to read from * OUT data - pointer to array of pointers to strings (e.g. env), * must be xfreed when no longer needed * OUT size - number of elements in data * IN job_ptr - job * RET 0 on success, -1 on error * NOTE: The output format of this must be identical with _xduparray2() */ static int _read_data_array_from_file(int fd, char *file_name, char ***data, uint32_t *size, job_record_t *job_ptr) { int pos, buf_size, amount, i, j; char *buffer, **array_ptr; uint32_t rec_cnt; xassert(file_name); xassert(data); xassert(size); *data = NULL; *size = 0; amount = read(fd, &rec_cnt, sizeof(uint32_t)); if (amount < sizeof(uint32_t)) { if (amount != 0) /* incomplete write */ error("Error reading file %s, %m", file_name); else verbose("File %s has zero size", file_name); return -1; } if (rec_cnt >= INT_MAX) { error("%s: unreasonable record counter %d in file %s", __func__, rec_cnt, file_name); return -1; } if (rec_cnt == 0) { *data = NULL; *size = 0; return 0; } pos = 0; buf_size = BUF_SIZE; buffer = xmalloc(buf_size + 1); while (1) { amount = read(fd, &buffer[pos], BUF_SIZE); if (amount < 0) { error("Error reading file %s, %m", file_name); xfree(buffer); return -1; } buffer[pos + amount] = '\0'; pos += amount; if (amount < BUF_SIZE) /* end of file */ break; buf_size += amount; xrealloc(buffer, buf_size + 1); } /* Allocate extra space for supplemental environment variables */ if (job_ptr->details->env_cnt) { for (j = 0; j < job_ptr->details->env_cnt; j++) pos += (strlen(job_ptr->details->env_sup[j]) + 1); xrealloc(buffer, pos); } /* We have all the data, now let's compute the pointers */ array_ptr = xcalloc((rec_cnt + job_ptr->details->env_cnt), sizeof(char *)); for (i = 0, pos = 0; i < rec_cnt; i++) { array_ptr[i] = &buffer[pos]; pos += strlen(&buffer[pos]) + 1; if ((pos > buf_size) && ((i + 1) < rec_cnt)) { error("Bad environment file %s", file_name); rec_cnt = i; break; } } /* Add supplemental environment variables */ if (job_ptr->details->env_cnt) { char *tmp_chr; int env_len, name_len; for (j = 0; j < job_ptr->details->env_cnt; j++) { tmp_chr = strchr(job_ptr->details->env_sup[j], '='); if (tmp_chr == NULL) { error("Invalid supplemental environment " "variable: %s", job_ptr->details->env_sup[j]); continue; } env_len = strlen(job_ptr->details->env_sup[j]) + 1; name_len = tmp_chr - job_ptr->details->env_sup[j] + 1; /* search for duplicate */ for (i = 0; i < rec_cnt; i++) { if (xstrncmp(array_ptr[i], job_ptr->details->env_sup[j], name_len)) { continue; } /* * If we are are the front we can not overwrite * that spot, we can clear it an then add to the * end of the array. */ if (i == 0) { array_ptr[0][0] = '\0'; i = rec_cnt; break; } /* over-write duplicate */ memcpy(&buffer[pos], job_ptr->details->env_sup[j], env_len); array_ptr[i] = &buffer[pos]; pos += env_len; break; } if (i >= rec_cnt) { /* add env to array end */ memcpy(&buffer[pos], job_ptr->details->env_sup[j], env_len); array_ptr[rec_cnt++] = &buffer[pos]; pos += env_len; } } } *size = rec_cnt; *data = array_ptr; return 0; } /* Given a job request, return a multi_core_data struct. * Returns NULL if no values set in the job/step request */ static multi_core_data_t * _set_multi_core_data(job_desc_msg_t * job_desc) { multi_core_data_t * mc_ptr; if ((job_desc->sockets_per_node == NO_VAL16) && (job_desc->cores_per_socket == NO_VAL16) && (job_desc->threads_per_core == NO_VAL16) && (job_desc->ntasks_per_socket == NO_VAL16) && (job_desc->ntasks_per_core == NO_VAL16) && (job_desc->plane_size == NO_VAL16)) return NULL; mc_ptr = xmalloc(sizeof(multi_core_data_t)); mc_ptr->sockets_per_node = job_desc->sockets_per_node; mc_ptr->cores_per_socket = job_desc->cores_per_socket; mc_ptr->threads_per_core = job_desc->threads_per_core; if (job_desc->ntasks_per_socket != NO_VAL16) mc_ptr->ntasks_per_socket = job_desc->ntasks_per_socket; else mc_ptr->ntasks_per_socket = INFINITE16; if (job_desc->ntasks_per_core != NO_VAL16) mc_ptr->ntasks_per_core = job_desc->ntasks_per_core; else if (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE) mc_ptr->ntasks_per_core = 1; else mc_ptr->ntasks_per_core = INFINITE16; if (job_desc->plane_size != NO_VAL16) mc_ptr->plane_size = job_desc->plane_size; else mc_ptr->plane_size = 0; return mc_ptr; } /* Return default "wait_all_nodes" option for a new job */ static uint16_t _default_wait_all_nodes(job_desc_msg_t *job_desc) { static uint16_t default_batch_wait = NO_VAL16; static time_t sched_update = 0; char *sched_params; if (!job_desc->script) return 0; if ((default_batch_wait != NO_VAL16) && (sched_update == slurmctld_conf.last_update)) return default_batch_wait; sched_params = slurm_get_sched_params(); if (xstrcasestr(sched_params, "sbatch_wait_nodes")) default_batch_wait = 1; else default_batch_wait = 0; xfree(sched_params); sched_update = slurmctld_conf.last_update; return default_batch_wait; } /* _copy_job_desc_to_job_record - copy the job descriptor from the RPC * structure into the actual slurmctld job record */ static int _copy_job_desc_to_job_record(job_desc_msg_t *job_desc, job_record_t **job_rec_ptr, bitstr_t **req_bitmap, bitstr_t **exc_bitmap) { int error_code; struct job_details *detail_ptr; job_record_t *job_ptr; if (slurm_get_track_wckey()) { if (!job_desc->wckey) { /* get the default wckey for this user since none was * given */ slurmdb_user_rec_t user_rec; memset(&user_rec, 0, sizeof(user_rec)); user_rec.uid = job_desc->user_id; assoc_mgr_fill_in_user(acct_db_conn, &user_rec, accounting_enforce, NULL, false); if (user_rec.default_wckey) job_desc->wckey = xstrdup_printf( "*%s", user_rec.default_wckey); else if (!(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) job_desc->wckey = xstrdup("*"); else { error("Job didn't specify wckey and user " "%d has no default.", job_desc->user_id); return ESLURM_INVALID_WCKEY; } } else if (job_desc->wckey) { slurmdb_wckey_rec_t wckey_rec, *wckey_ptr = NULL; memset(&wckey_rec, 0, sizeof(wckey_rec)); wckey_rec.uid = job_desc->user_id; wckey_rec.name = job_desc->wckey; if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, accounting_enforce, &wckey_ptr, false)) { if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) { error("%s: invalid wckey '%s' for " "user %u.", __func__, wckey_rec.name, job_desc->user_id); return ESLURM_INVALID_WCKEY; } } } else if (accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS) { /* This should never happen */ info("%s: no wckey was given for job submit", __func__); return ESLURM_INVALID_WCKEY; } } job_ptr = _create_job_record(1); if (!job_ptr) return SLURM_ERROR; *job_rec_ptr = job_ptr; job_ptr->partition = xstrdup(job_desc->partition); if (job_desc->profile != ACCT_GATHER_PROFILE_NOT_SET) job_ptr->profile = job_desc->profile; if (job_desc->job_id != NO_VAL) { /* already confirmed unique */ job_ptr->job_id = job_desc->job_id; } else { error_code = _set_job_id(job_ptr); if (error_code) return error_code; } job_ptr->name = xstrdup(job_desc->name); job_ptr->wckey = xstrdup(job_desc->wckey); /* Since this is only used in the slurmctld, copy it now. */ job_ptr->tres_req_cnt = job_desc->tres_req_cnt; job_desc->tres_req_cnt = NULL; set_job_tres_req_str(job_ptr, false); _add_job_hash(job_ptr); job_ptr->user_id = (uid_t) job_desc->user_id; job_ptr->group_id = (gid_t) job_desc->group_id; job_ptr->job_state = JOB_PENDING; job_ptr->time_limit = job_desc->time_limit; job_ptr->deadline = job_desc->deadline; if (job_desc->delay_boot == NO_VAL) job_ptr->delay_boot = delay_boot; else job_ptr->delay_boot = job_desc->delay_boot; if (job_desc->time_min != NO_VAL) job_ptr->time_min = job_desc->time_min; job_ptr->alloc_sid = job_desc->alloc_sid; job_ptr->alloc_node = xstrdup(job_desc->alloc_node); job_ptr->account = xstrdup(job_desc->account); job_ptr->batch_features = xstrdup(job_desc->batch_features); job_ptr->burst_buffer = xstrdup(job_desc->burst_buffer); job_ptr->network = xstrdup(job_desc->network); job_ptr->resv_name = xstrdup(job_desc->reservation); job_ptr->restart_cnt = job_desc->restart_cnt; job_ptr->comment = xstrdup(job_desc->comment); job_ptr->admin_comment = xstrdup(job_desc->admin_comment); if (job_desc->kill_on_node_fail != NO_VAL16) job_ptr->kill_on_node_fail = job_desc->kill_on_node_fail; job_ptr->resp_host = xstrdup(job_desc->resp_host); job_ptr->alloc_resp_port = job_desc->alloc_resp_port; job_ptr->other_port = job_desc->other_port; job_ptr->power_flags = job_desc->power_flags; job_ptr->time_last_active = time(NULL); job_ptr->cr_enabled = 0; job_ptr->derived_ec = 0; job_ptr->licenses = xstrdup(job_desc->licenses); if (job_desc->mail_type && (job_desc->mail_type != NO_VAL16)) { job_ptr->mail_type = job_desc->mail_type; job_ptr->mail_user = _get_mail_user(job_desc->mail_user, job_ptr->user_id); } job_ptr->bit_flags = job_desc->bitflags; job_ptr->bit_flags &= ~BACKFILL_TEST; job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST; job_ptr->spank_job_env = job_desc->spank_job_env; job_ptr->spank_job_env_size = job_desc->spank_job_env_size; job_desc->spank_job_env = (char **) NULL; /* nothing left to free */ job_desc->spank_job_env_size = 0; /* nothing left to free */ job_ptr->mcs_label = xstrdup(job_desc->mcs_label); job_ptr->origin_cluster = xstrdup(job_desc->origin_cluster); job_ptr->cpus_per_tres = xstrdup(job_desc->cpus_per_tres); job_ptr->mem_per_tres = xstrdup(job_desc->mem_per_tres); job_ptr->tres_bind = xstrdup(job_desc->tres_bind); job_ptr->tres_freq = xstrdup(job_desc->tres_freq); job_ptr->tres_per_job = xstrdup(job_desc->tres_per_job); job_ptr->tres_per_node = xstrdup(job_desc->tres_per_node); job_ptr->tres_per_socket = xstrdup(job_desc->tres_per_socket); job_ptr->tres_per_task = xstrdup(job_desc->tres_per_task); if (job_desc->wait_all_nodes == NO_VAL16) job_ptr->wait_all_nodes = _default_wait_all_nodes(job_desc); else job_ptr->wait_all_nodes = job_desc->wait_all_nodes; job_ptr->warn_flags = job_desc->warn_flags; job_ptr->warn_signal = job_desc->warn_signal; job_ptr->warn_time = job_desc->warn_time; detail_ptr = job_ptr->details; detail_ptr->argc = job_desc->argc; detail_ptr->argv = job_desc->argv; job_desc->argv = (char **) NULL; /* nothing left to free */ job_desc->argc = 0; /* nothing left to free */ detail_ptr->acctg_freq = xstrdup(job_desc->acctg_freq); detail_ptr->cpu_bind_type = job_desc->cpu_bind_type; detail_ptr->cpu_bind = xstrdup(job_desc->cpu_bind); detail_ptr->cpu_freq_gov = job_desc->cpu_freq_gov; detail_ptr->cpu_freq_max = job_desc->cpu_freq_max; detail_ptr->cpu_freq_min = job_desc->cpu_freq_min; detail_ptr->extra = job_desc->extra; detail_ptr->nice = job_desc->nice; detail_ptr->open_mode = job_desc->open_mode; detail_ptr->min_cpus = job_desc->min_cpus; detail_ptr->orig_min_cpus = job_desc->min_cpus; detail_ptr->max_cpus = job_desc->max_cpus; detail_ptr->orig_max_cpus = job_desc->max_cpus; detail_ptr->min_nodes = job_desc->min_nodes; detail_ptr->max_nodes = job_desc->max_nodes; detail_ptr->x11 = job_desc->x11; detail_ptr->x11_magic_cookie = xstrdup(job_desc->x11_magic_cookie); detail_ptr->x11_target = xstrdup(job_desc->x11_target); detail_ptr->x11_target_port = job_desc->x11_target_port; if (job_desc->req_nodes) { detail_ptr->req_nodes = _copy_nodelist_no_dup(job_desc->req_nodes); detail_ptr->req_node_bitmap = *req_bitmap; *req_bitmap = NULL; /* Reused nothing left to free */ } if (job_desc->exc_nodes) { detail_ptr->exc_nodes = _copy_nodelist_no_dup(job_desc->exc_nodes); detail_ptr->exc_node_bitmap = *exc_bitmap; *exc_bitmap = NULL; /* Reused nothing left to free */ } detail_ptr->features = xstrdup(job_desc->features); detail_ptr->cluster_features = xstrdup(job_desc->cluster_features); if (job_desc->fed_siblings_viable) { job_ptr->fed_details = xmalloc(sizeof(job_fed_details_t)); job_ptr->fed_details->siblings_viable = job_desc->fed_siblings_viable; update_job_fed_details(job_ptr); } if (job_desc->shared == JOB_SHARED_NONE) { detail_ptr->share_res = 0; detail_ptr->whole_node = WHOLE_NODE_REQUIRED; } else if (job_desc->shared == JOB_SHARED_OK) { detail_ptr->share_res = 1; detail_ptr->whole_node = 0; } else if (job_desc->shared == JOB_SHARED_USER) { detail_ptr->share_res = NO_VAL8; detail_ptr->whole_node = WHOLE_NODE_USER; } else if (job_desc->shared == JOB_SHARED_MCS) { detail_ptr->share_res = NO_VAL8; detail_ptr->whole_node = WHOLE_NODE_MCS; } else { detail_ptr->share_res = NO_VAL8; detail_ptr->whole_node = 0; } if (job_desc->contiguous != NO_VAL16) detail_ptr->contiguous = job_desc->contiguous; if (slurmctld_conf.conf_flags & CTL_CONF_ASRU) detail_ptr->core_spec = job_desc->core_spec; else detail_ptr->core_spec = NO_VAL16; if (detail_ptr->core_spec != NO_VAL16) detail_ptr->whole_node = 1; if (job_desc->task_dist != NO_VAL) detail_ptr->task_dist = job_desc->task_dist; if (job_desc->cpus_per_task == NO_VAL16) { detail_ptr->cpus_per_task = 1; detail_ptr->orig_cpus_per_task = NO_VAL16; } else { detail_ptr->cpus_per_task = MAX(job_desc->cpus_per_task, 1); detail_ptr->orig_cpus_per_task = detail_ptr->cpus_per_task; } if (job_desc->pn_min_cpus != NO_VAL16) detail_ptr->pn_min_cpus = job_desc->pn_min_cpus; if (job_desc->overcommit != NO_VAL8) detail_ptr->overcommit = job_desc->overcommit; if (job_desc->num_tasks != NO_VAL) detail_ptr->num_tasks = job_desc->num_tasks; if (job_desc->ntasks_per_node != NO_VAL16) { detail_ptr->ntasks_per_node = job_desc->ntasks_per_node; if ((detail_ptr->overcommit == 0) && (detail_ptr->num_tasks > 1)) { detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus, (detail_ptr->cpus_per_task * detail_ptr->ntasks_per_node)); } } detail_ptr->pn_min_cpus = MAX(detail_ptr->pn_min_cpus, detail_ptr->cpus_per_task); detail_ptr->orig_pn_min_cpus = detail_ptr->pn_min_cpus; if (job_desc->reboot != NO_VAL16) job_ptr->reboot = MIN(job_desc->reboot, 1); else job_ptr->reboot = 0; if (job_desc->requeue != NO_VAL16) detail_ptr->requeue = MIN(job_desc->requeue, 1); else detail_ptr->requeue = slurmctld_conf.job_requeue; if (job_desc->pn_min_memory != NO_VAL64) detail_ptr->pn_min_memory = job_desc->pn_min_memory; detail_ptr->orig_pn_min_memory = detail_ptr->pn_min_memory; if (job_desc->pn_min_tmp_disk != NO_VAL) detail_ptr->pn_min_tmp_disk = job_desc->pn_min_tmp_disk; detail_ptr->std_err = xstrdup(job_desc->std_err); detail_ptr->std_in = xstrdup(job_desc->std_in); detail_ptr->std_out = xstrdup(job_desc->std_out); detail_ptr->work_dir = xstrdup(job_desc->work_dir); if (job_desc->begin_time > time(NULL)) detail_ptr->begin_time = job_desc->begin_time; job_ptr->select_jobinfo = select_g_select_jobinfo_copy(job_desc->select_jobinfo); select_g_select_jobinfo_set(job_ptr->select_jobinfo, SELECT_JOBDATA_NETWORK, job_ptr->network); job_ptr->clusters = xstrdup(job_desc->clusters); /* * The priority needs to be set after this since we don't have * an association rec yet */ detail_ptr->mc_ptr = _set_multi_core_data(job_desc); if ((job_ptr->bit_flags & SPREAD_JOB) && (detail_ptr->max_nodes == 0) && (detail_ptr->num_tasks != 0)) { if (detail_ptr->min_nodes == 0) detail_ptr->min_nodes = 1; detail_ptr->max_nodes = MIN(node_record_count, detail_ptr->num_tasks); } return SLURM_SUCCESS; } /* * _copy_nodelist_no_dup - Take a node_list string and convert it to an * expression without duplicate names. For example, we want to convert * a users request for nodes "lx1,lx2,lx1,lx3" to "lx[1-3]" * node_list IN - string describing a list of nodes * RET a compact node expression, must be xfreed by the user */ static char *_copy_nodelist_no_dup(char *node_list) { char *buf; hostlist_t hl = hostlist_create(node_list); if (hl == NULL) return NULL; hostlist_uniq(hl); buf = hostlist_ranged_string_xmalloc(hl); hostlist_destroy(hl); return buf; } /* Return the number of CPUs on the first node in the identified partition */ static uint16_t _cpus_per_node_part(part_record_t *part_ptr) { int node_inx = -1; node_record_t *node_ptr; if (part_ptr->node_bitmap) node_inx = bit_ffs(part_ptr->node_bitmap); if (node_inx >= 0) { node_ptr = node_record_table_ptr + node_inx; return node_ptr->config_ptr->cpus; } return 0; } /* * Test if this job exceeds any of MaxMemPer[CPU|Node] limits and potentially * adjust mem / cpu ratios. * * NOTE: This function is also called with a dummy job_desc_msg_t from * job_limits_check(), if there is any new check added here you may also have to * add that parameter to the job_desc_msg_t in that function. */ static bool _valid_pn_min_mem(job_desc_msg_t *job_desc_msg, part_record_t *part_ptr) { uint64_t job_mem_limit = job_desc_msg->pn_min_memory; uint64_t sys_mem_limit; uint16_t cpus_per_node; if (part_ptr && part_ptr->max_mem_per_cpu) sys_mem_limit = part_ptr->max_mem_per_cpu; else sys_mem_limit = slurmctld_conf.max_mem_per_cpu; if ((sys_mem_limit == 0) || (sys_mem_limit == MEM_PER_CPU)) return true; if ((job_mem_limit & MEM_PER_CPU) && (sys_mem_limit & MEM_PER_CPU)) { uint64_t mem_ratio; job_mem_limit &= (~MEM_PER_CPU); sys_mem_limit &= (~MEM_PER_CPU); if (job_mem_limit <= sys_mem_limit) return true; mem_ratio = (job_mem_limit + sys_mem_limit - 1); mem_ratio /= sys_mem_limit; debug("increasing cpus_per_task and decreasing mem_per_cpu by " "factor of %"PRIu64" based upon mem_per_cpu limits", mem_ratio); if (job_desc_msg->cpus_per_task == NO_VAL16) job_desc_msg->cpus_per_task = mem_ratio; else job_desc_msg->cpus_per_task *= mem_ratio; job_desc_msg->pn_min_memory = ((job_mem_limit + mem_ratio - 1) / mem_ratio) | MEM_PER_CPU; if ((job_desc_msg->num_tasks != NO_VAL) && (job_desc_msg->num_tasks != 0) && (job_desc_msg->min_cpus != NO_VAL)) { job_desc_msg->min_cpus = job_desc_msg->num_tasks * job_desc_msg->cpus_per_task; if ((job_desc_msg->max_cpus != NO_VAL) && (job_desc_msg->max_cpus < job_desc_msg->min_cpus)) { job_desc_msg->max_cpus = job_desc_msg->min_cpus; } } return true; } if (((job_mem_limit & MEM_PER_CPU) == 0) && ((sys_mem_limit & MEM_PER_CPU) == 0)) { if (job_mem_limit <= sys_mem_limit) return true; debug2("JobId=%u mem=%"PRIu64"M > MaxMemPerNode=%"PRIu64"M in partition %s", job_desc_msg->job_id, job_mem_limit, sys_mem_limit, (part_ptr && part_ptr->name) ? part_ptr->name : "N/A"); return false; } /* Job and system have different memory limit forms (i.e. one is a * per-job and the other is per-node). Covert them both to per-node * values for comparison. */ if (part_ptr && (!part_ptr->max_share || !job_desc_msg->shared)) { /* Whole node allocation */ cpus_per_node = _cpus_per_node_part(part_ptr); } else { if ((job_desc_msg->ntasks_per_node != NO_VAL16) && (job_desc_msg->ntasks_per_node != 0)) cpus_per_node = job_desc_msg->ntasks_per_node; else cpus_per_node = 1; if ((job_desc_msg->num_tasks != NO_VAL) && (job_desc_msg->num_tasks != 0) && (job_desc_msg->max_nodes != NO_VAL) && (job_desc_msg->max_nodes != 0)) { cpus_per_node = MAX(cpus_per_node, ((job_desc_msg->num_tasks + job_desc_msg->max_nodes - 1) / job_desc_msg->max_nodes)); } if ((job_desc_msg->cpus_per_task != NO_VAL16) && (job_desc_msg->cpus_per_task != 0)) cpus_per_node *= job_desc_msg->cpus_per_task; if ((job_desc_msg->pn_min_cpus != NO_VAL16) && (job_desc_msg->pn_min_cpus > cpus_per_node)) cpus_per_node = job_desc_msg->pn_min_cpus; } if (job_mem_limit & MEM_PER_CPU) { /* Job has per-CPU memory limit, system has per-node limit */ job_mem_limit &= (~MEM_PER_CPU); job_mem_limit *= cpus_per_node; } else { /* Job has per-node memory limit, system has per-CPU limit */ uint32_t min_cpus; sys_mem_limit &= (~MEM_PER_CPU); min_cpus = (job_mem_limit + sys_mem_limit - 1) / sys_mem_limit; if ((job_desc_msg->pn_min_cpus == NO_VAL16) || (job_desc_msg->pn_min_cpus < min_cpus)) { debug("Setting job's pn_min_cpus to %u due to memory " "limit", min_cpus); job_desc_msg->pn_min_cpus = min_cpus; cpus_per_node = MAX(cpus_per_node, min_cpus); } sys_mem_limit *= cpus_per_node; } if (job_mem_limit <= sys_mem_limit) return true; debug2("JobId=%u mem=%"PRIu64"M > MaxMemPer%s=%"PRIu64"M in partition:%s", job_desc_msg->job_id, job_mem_limit, (job_mem_limit & MEM_PER_CPU) ? "CPU" : "Node", sys_mem_limit, (part_ptr && part_ptr->name) ? part_ptr->name : "N/A"); return false; } /* * Validate TRES specification of the form: * "name=[type:]#[,[type:]#][;name=[type:]#]" * For example: "gpu:kepler:2,craynetwork=1" */ extern bool valid_tres_cnt(char *tres) { char *end_ptr = NULL, *colon, *save_ptr = NULL, *sep, *tok, *tmp; bool rc = true; long long int val; if (!tres || (tres[0] == '\0')) return true; tmp = xstrdup(tres); tok = strtok_r(tmp, ",", &save_ptr); while (tok) { bool valid_name = false; sep = strchr(tok, ':'); if (sep) { sep[0] = '\0'; sep++; } if (valid_tres_name(tok)) valid_name = true; if (!sep) { /* No model or count. Implicit count of 1 */ if (!valid_name) { rc = false; break; } } else if ((colon = strchr(sep, ':'))) { /* Includes explicit "name:type:count" */ sep = colon + 1; /* Points to count */ val = strtoll(sep, &end_ptr, 10); /* First only check numeric component for validity */ if (((val < 0) || (val == LLONG_MAX)) || (!valid_name && (val != 0))) { rc = false; break; } /* * Now check that any count suffic is valid. */ if (suffix_mult(end_ptr) == NO_VAL64) { rc = false; break; } } else { /* * Includes "name:type" or "name:count" * Since we don't know if there is a count, * we can not do more now. */ } tok = strtok_r(NULL, ",", &save_ptr); } xfree(tmp); return rc; } /* * Validate the named TRES is valid for scheduling parameters. * Returns FALSE if the name is invalid or the GRES count is zero. */ extern bool valid_tres_name(char *name) { if (!name || (name[0] == '\0')) return false; if (gres_get_system_cnt(name) != NO_VAL64) return true; return false; } /* * Increment time limit of one job record for node configuraiton. */ static void _job_time_limit_incr(job_record_t *job_ptr, uint32_t boot_job_id) { time_t delta_t, now = time(NULL); delta_t = difftime(now, job_ptr->start_time); if ((job_ptr->job_id != boot_job_id) && !IS_JOB_CONFIGURING(job_ptr)) job_ptr->tot_sus_time = delta_t; if ((job_ptr->time_limit != INFINITE) && ((job_ptr->job_id == boot_job_id) || (delta_t != 0))) { if (delta_t && !IS_JOB_CONFIGURING(job_ptr)) { verbose("Extending %pJ time limit by %u secs for configuration", job_ptr, (uint32_t) delta_t); } job_ptr->end_time = now + (job_ptr->time_limit * 60); job_ptr->end_time_exp = job_ptr->end_time; } } /* * Increment time limit for all components of a hetjob for node configuraiton. * job_ptr IN - pointer to job record for which configuration is complete * boot_job_id - job ID of record with newly powered up node or 0 */ static void _het_job_time_limit_incr(job_record_t *job_ptr, uint32_t boot_job_id) { job_record_t *het_job_leader, *het_job; ListIterator iter; if (!job_ptr->het_job_id) { _job_time_limit_incr(job_ptr, boot_job_id); return; } het_job_leader = find_job_record(job_ptr->het_job_id); if (!het_job_leader) { error("%s: Hetjob leader %pJ not found", __func__, job_ptr); _job_time_limit_incr(job_ptr, boot_job_id); return; } if (!het_job_leader->het_job_list) { error("%s: Hetjob leader %pJ job list is NULL", __func__, job_ptr); _job_time_limit_incr(job_ptr, boot_job_id); return; } iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { _job_time_limit_incr(het_job, boot_job_id); } list_iterator_destroy(iter); } /* Clear job's CONFIGURING flag and advance end time as needed */ extern void job_config_fini(job_record_t *job_ptr) { time_t now = time(NULL); last_job_update = now; job_ptr->job_state &= ~JOB_CONFIGURING; if (IS_JOB_POWER_UP_NODE(job_ptr)) { info("Resetting %pJ start time for node power up", job_ptr); job_ptr->job_state &= ~JOB_POWER_UP_NODE; job_ptr->start_time = now; _het_job_time_limit_incr(job_ptr, job_ptr->job_id); jobacct_storage_g_job_start(acct_db_conn, job_ptr); } else { _het_job_time_limit_incr(job_ptr, 0); } /* * Request asynchronous launch of a prolog for a non-batch job. * PROLOG_FLAG_CONTAIN also turns on PROLOG_FLAG_ALLOC. */ if (slurmctld_conf.prolog_flags & PROLOG_FLAG_ALLOC) launch_prolog(job_ptr); } /* * Determine of the nodes are ready to run a job * RET true if ready */ extern bool test_job_nodes_ready(job_record_t *job_ptr) { if (IS_JOB_PENDING(job_ptr)) return false; if (!job_ptr->node_bitmap) /* Revoked allocation */ return true; if (bit_overlap_any(job_ptr->node_bitmap, power_node_bitmap)) return false; if (!job_ptr->batch_flag || job_ptr->batch_features || job_ptr->wait_all_nodes || job_ptr->burst_buffer) { /* Make sure all nodes ready to start job */ if ((select_g_job_ready(job_ptr) & READY_NODE_STATE) == 0) return false; } else if (job_ptr->batch_flag) { #ifdef HAVE_FRONT_END /* Make sure frontend node is ready to start batch job */ front_end_record_t *front_end_ptr = find_front_end_record(job_ptr->batch_host); if (!front_end_ptr || IS_NODE_POWER_SAVE(front_end_ptr) || IS_NODE_POWER_UP(front_end_ptr)) { return false; } #else /* Make sure first node is ready to start batch job */ node_record_t *node_ptr = find_node_record(job_ptr->batch_host); if (!node_ptr || IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr)) { return false; } #endif } return true; } /* * For non-hetjob, return true if this job is configuring. * For hetjob, return true if any component of the job is configuring. */ static bool _het_job_configuring_test(job_record_t *job_ptr) { job_record_t *het_job_leader, *het_job; ListIterator iter; bool result = false; if (IS_JOB_CONFIGURING(job_ptr)) return true; if (!job_ptr->het_job_id) return false; het_job_leader = find_job_record(job_ptr->het_job_id); if (!het_job_leader) { error("%s: Hetjob leader %pJ not found", __func__, job_ptr); return false; } if (!het_job_leader->het_job_list) { error("%s: Hetjob leader %pJ job list is NULL", __func__, job_ptr); return false; } iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { if (IS_JOB_CONFIGURING(het_job)) { result = true; break; } } list_iterator_destroy(iter); return result; } /* * job_time_limit - terminate jobs which have exceeded their time limit * global: job_list - pointer global job list * last_job_update - time of last job table update */ void job_time_limit(void) { ListIterator job_iterator; job_record_t *job_ptr; time_t now = time(NULL); time_t old = now - ((slurmctld_conf.inactive_limit * 4 / 3) + slurmctld_conf.msg_timeout + 1); time_t over_run; uint16_t over_time_limit; uint8_t prolog; int job_test_count = 0; uint32_t resv_over_run = slurmctld_conf.resv_over_run; xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); if (resv_over_run == INFINITE16) resv_over_run = YEAR_SECONDS; else resv_over_run *= 60; /* * locks same as in _slurmctld_background() (The only current place this * is called). */ slurmctld_lock_t job_write_lock = { READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK }; DEF_TIMERS; job_iterator = list_iterator_create(job_list); START_TIMER; while ((job_ptr = list_next(job_iterator))) { xassert (job_ptr->magic == JOB_MAGIC); job_test_count++; if (job_ptr->details) prolog = job_ptr->details->prolog_running; else prolog = 0; if ((prolog == 0) && IS_JOB_CONFIGURING(job_ptr) && test_job_nodes_ready(job_ptr)) { info("%s: Configuration for %pJ complete", __func__, job_ptr); job_config_fini(job_ptr); if (job_ptr->bit_flags & NODE_REBOOT) { job_ptr->bit_flags &= (~NODE_REBOOT); if (job_ptr->batch_flag) launch_job(job_ptr); } } /* * Features have been changed on some node, make job eligiable * to run and test to see if it can run now */ if (node_features_updated && (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) && IS_JOB_PENDING(job_ptr) && (job_ptr->priority == 0)) { job_ptr->state_reason = WAIT_NO_REASON; set_job_prio(job_ptr); last_job_update = now; } /* Don't enforce time limits for configuring hetjobs */ if (_het_job_configuring_test(job_ptr)) continue; /* * Only running jobs can be killed due to timeout. Do not kill * suspended jobs due to timeout. */ if (!IS_JOB_RUNNING(job_ptr)) continue; /* * everything above here is considered "quick", and skips the * timeout at the bottom of the loop by using a continue. * everything below is considered "slow", and needs to jump to * time_check before the next job is tested */ if (job_ptr->preempt_time) { (void)slurm_job_preempt(job_ptr, NULL, slurm_job_preempt_mode(job_ptr), false); goto time_check; } if (slurmctld_conf.inactive_limit && (job_ptr->batch_flag == 0) && (job_ptr->time_last_active <= old) && (job_ptr->other_port) && (job_ptr->part_ptr) && (!(job_ptr->part_ptr->flags & PART_FLAG_ROOT_ONLY))) { /* job inactive, kill it */ info("%s: inactivity time limit reached for %pJ", __func__, job_ptr); _job_timed_out(job_ptr, false); job_ptr->state_reason = FAIL_INACTIVE_LIMIT; xfree(job_ptr->state_desc); goto time_check; } if (job_ptr->time_limit != INFINITE) { send_job_warn_signal(job_ptr, false); if ((job_ptr->mail_type & MAIL_JOB_TIME100) && (now >= job_ptr->end_time)) { job_ptr->mail_type &= (~MAIL_JOB_TIME100); mail_job_info(job_ptr, MAIL_JOB_TIME100); } if ((job_ptr->mail_type & MAIL_JOB_TIME90) && (now + (job_ptr->time_limit * 60 * 0.1) >= job_ptr->end_time)) { job_ptr->mail_type &= (~MAIL_JOB_TIME90); mail_job_info(job_ptr, MAIL_JOB_TIME90); } if ((job_ptr->mail_type & MAIL_JOB_TIME80) && (now + (job_ptr->time_limit * 60 * 0.2) >= job_ptr->end_time)) { job_ptr->mail_type &= (~MAIL_JOB_TIME80); mail_job_info(job_ptr, MAIL_JOB_TIME80); } if ((job_ptr->mail_type & MAIL_JOB_TIME50) && (now + (job_ptr->time_limit * 60 * 0.5) >= job_ptr->end_time)) { job_ptr->mail_type &= (~MAIL_JOB_TIME50); mail_job_info(job_ptr, MAIL_JOB_TIME50); } if (job_ptr->part_ptr && (job_ptr->part_ptr->over_time_limit != NO_VAL16)) { over_time_limit = job_ptr->part_ptr->over_time_limit; } else { over_time_limit = slurmctld_conf.over_time_limit; } if (over_time_limit == INFINITE16) over_run = now - YEAR_SECONDS; else over_run = now - (over_time_limit * 60); if (job_ptr->end_time <= over_run) { last_job_update = now; info("Time limit exhausted for %pJ", job_ptr); _job_timed_out(job_ptr, false); job_ptr->state_reason = FAIL_TIMEOUT; xfree(job_ptr->state_desc); goto time_check; } } if (job_ptr->resv_ptr && !(job_ptr->resv_ptr->flags & RESERVE_FLAG_FLEX) && (job_ptr->resv_ptr->end_time + resv_over_run) < time(NULL)){ last_job_update = now; info("Reservation ended for %pJ", job_ptr); _job_timed_out(job_ptr, false); job_ptr->state_reason = FAIL_TIMEOUT; xfree(job_ptr->state_desc); goto time_check; } /* * check if any individual job steps have exceeded * their time limit */ if (job_ptr->step_list && (list_count(job_ptr->step_list) > 0)) check_job_step_time_limit(job_ptr, now); acct_policy_job_time_out(job_ptr); if (job_ptr->state_reason == FAIL_TIMEOUT) { last_job_update = now; _job_timed_out(job_ptr, false); xfree(job_ptr->state_desc); goto time_check; } /* Give srun command warning message about pending timeout */ if (job_ptr->end_time <= (now + PERIODIC_TIMEOUT * 2)) srun_timeout (job_ptr); /* * _job_timed_out() and other calls can take a long time on * some platforms. This loop is holding the job_write lock; * if a lot of jobs need to be timed out within the same cycle * this stalls other threads from running and causes * communication issues within the cluster. * * This test happens last, as job_ptr may be pointing to a job * that would be deleted by a separate thread when the job_write * lock is released. However, list_next itself is thread safe, * and can be used again once the locks are reacquired. * list_peek_next is used in the unlikely event the timer has * expired just as the end of the job_list is reached. */ time_check: /* Use a hard-coded 3 second timeout, with a 1 second sleep. */ if (slurm_delta_tv(&tv1) >= 3000000 && list_peek_next(job_iterator) ) { END_TIMER; debug("%s: yielding locks after testing" " %d jobs, %s", __func__, job_test_count, TIME_STR); unlock_slurmctld(job_write_lock); usleep(1000000); lock_slurmctld(job_write_lock); START_TIMER; job_test_count = 0; } } list_iterator_destroy(job_iterator); node_features_updated = false; } extern void job_set_req_tres(job_record_t *job_ptr, bool assoc_mgr_locked) { uint32_t cpu_cnt = 0, node_cnt = 0; uint64_t mem_cnt = 0; assoc_mgr_lock_t locks = { .tres = READ_LOCK }; xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); xfree(job_ptr->tres_req_str); xfree(job_ptr->tres_fmt_req_str); xfree(job_ptr->tres_req_cnt); if (!assoc_mgr_locked) assoc_mgr_lock(&locks); job_ptr->tres_req_cnt = xcalloc(g_tres_count, sizeof(uint64_t)); if (job_ptr->details) { node_cnt = job_ptr->details->min_nodes; cpu_cnt = job_ptr->details->min_cpus; if (job_ptr->details->pn_min_memory) mem_cnt = job_ptr->details->pn_min_memory; } /* if this is set just override */ if (job_ptr->total_cpus) cpu_cnt = job_ptr->total_cpus; if (job_ptr->node_cnt) node_cnt = job_ptr->node_cnt; job_ptr->tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)node_cnt; job_ptr->tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)cpu_cnt; job_ptr->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( job_ptr->job_resrcs, mem_cnt, cpu_cnt, node_cnt); license_set_job_tres_cnt(job_ptr->license_list, job_ptr->tres_req_cnt, true); /* FIXME: this assumes that all nodes have equal TRES */ gres_set_job_tres_cnt(job_ptr->gres_list, node_cnt, job_ptr->tres_req_cnt, true); bb_g_job_set_tres_cnt(job_ptr, job_ptr->tres_req_cnt, true); /* * Do this last as it calculates off of everything else. * Don't use calc_job_billable_tres() as it relies on allocated tres * If the partition was destroyed the part_ptr will be NULL. As this * could be run on already finished jobs running in the assoc mgr * cache. */ if (job_ptr->part_ptr) job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] = assoc_mgr_tres_weighted( job_ptr->tres_req_cnt, job_ptr->part_ptr->billing_weights, slurmctld_conf.priority_flags, true); /* now that the array is filled lets make the string from it */ set_job_tres_req_str(job_ptr, true); if (!assoc_mgr_locked) assoc_mgr_unlock(&locks); } extern void job_set_alloc_tres(job_record_t *job_ptr, bool assoc_mgr_locked) { uint32_t alloc_nodes = 0; assoc_mgr_lock_t locks = { .tres = READ_LOCK }; xfree(job_ptr->tres_alloc_str); xfree(job_ptr->tres_alloc_cnt); xfree(job_ptr->tres_fmt_alloc_str); /* * We only need to do this on non-pending jobs. * Requeued jobs are marked as PENDING|COMPLETING until the epilog is * finished so we still need the alloc tres until then. */ if (IS_JOB_PENDING(job_ptr) && !IS_JOB_COMPLETING(job_ptr)) return; if (!assoc_mgr_locked) assoc_mgr_lock(&locks); job_ptr->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt, sizeof(uint64_t)); job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU] = (uint64_t)job_ptr->total_cpus; alloc_nodes = job_ptr->node_cnt; job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE] = (uint64_t)alloc_nodes; job_ptr->tres_alloc_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( job_ptr->job_resrcs, job_ptr->details->pn_min_memory, job_ptr->tres_alloc_cnt[TRES_ARRAY_CPU], job_ptr->tres_alloc_cnt[TRES_ARRAY_NODE]); job_ptr->tres_alloc_cnt[TRES_ARRAY_ENERGY] = NO_VAL64; license_set_job_tres_cnt(job_ptr->license_list, job_ptr->tres_alloc_cnt, true); gres_set_job_tres_cnt(job_ptr->gres_list, alloc_nodes, job_ptr->tres_alloc_cnt, true); bb_g_job_set_tres_cnt(job_ptr, job_ptr->tres_alloc_cnt, true); /* Do this last as it calculates off of everything else. */ job_ptr->tres_alloc_cnt[TRES_ARRAY_BILLING] = calc_job_billable_tres(job_ptr, job_ptr->start_time, true); /* now that the array is filled lets make the string from it */ set_job_tres_alloc_str(job_ptr, true); if (!assoc_mgr_locked) assoc_mgr_unlock(&locks); return; } /* * job_update_tres_cnt - when job is completing remove allocated tres * from count. * IN/OUT job_ptr - job structure to be updated * IN node_inx - node bit that is finished with job. * RET SLURM_SUCCES on success SLURM_ERROR on cpu_cnt underflow */ extern int job_update_tres_cnt(job_record_t *job_ptr, int node_inx) { int cpu_cnt, offset = -1, rc = SLURM_SUCCESS; xassert(job_ptr); if (job_ptr->details->whole_node == 1) { /* * Since we are allocating whole nodes don't rely on * the job_resrcs since it could be less because the * node could of only used 1 thread per core. */ node_record_t *node_ptr = node_record_table_ptr + node_inx; cpu_cnt = node_ptr->config_ptr->cpus; } else { if ((offset = job_resources_node_inx_to_cpu_inx( job_ptr->job_resrcs, node_inx)) < 0) { error("%s: problem getting offset of %pJ", __func__, job_ptr); job_ptr->cpu_cnt = 0; return SLURM_ERROR; } cpu_cnt = job_ptr->job_resrcs->cpus[offset]; } if (cpu_cnt > job_ptr->cpu_cnt) { error("%s: cpu_cnt underflow (%d > %u) on %pJ", __func__, cpu_cnt, job_ptr->cpu_cnt, job_ptr); job_ptr->cpu_cnt = 0; rc = SLURM_ERROR; } else job_ptr->cpu_cnt -= cpu_cnt; if (IS_JOB_RESIZING(job_ptr)) { if (cpu_cnt > job_ptr->total_cpus) { error("%s: total_cpus underflow on %pJ", __func__, job_ptr); job_ptr->total_cpus = 0; rc = SLURM_ERROR; } else job_ptr->total_cpus -= cpu_cnt; job_set_alloc_tres(job_ptr, false); } return rc; } /* Terminate a job that has exhausted its time limit */ static void _job_timed_out(job_record_t *job_ptr, bool preempted) { xassert(job_ptr); srun_timeout(job_ptr); if (job_ptr->details) { time_t now = time(NULL); job_ptr->end_time = now; job_ptr->time_last_active = now; if (!job_ptr->preempt_time) job_ptr->job_state = JOB_TIMEOUT | JOB_COMPLETING; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, false); deallocate_nodes(job_ptr, !preempted, false, preempted); } else job_signal(job_ptr, SIGKILL, 0, 0, false); return; } /* _validate_job_desc - validate that a job descriptor for job submit or * allocate has valid data, set values to defaults as required * IN/OUT job_desc_msg - pointer to job descriptor, modified as needed * IN allocate - if clear job to be queued, if set allocate for user now * IN submit_uid - who request originated */ static int _validate_job_desc(job_desc_msg_t *job_desc_msg, int allocate, uid_t submit_uid, part_record_t *part_ptr, List part_list) { if ((job_desc_msg->min_cpus == NO_VAL) && (job_desc_msg->min_nodes == NO_VAL) && (job_desc_msg->req_nodes == NULL)) { info("%s: job specified no min_cpus, min_nodes or req_nodes", __func__); return ESLURM_JOB_MISSING_SIZE_SPECIFICATION; } if ((allocate == SLURM_CREATE_JOB_FLAG_NO_ALLOCATE_0) && (job_desc_msg->script == NULL)) { info("%s: job failed to specify Script", __func__); return ESLURM_JOB_SCRIPT_MISSING; } if (job_desc_msg->script && job_desc_msg->x11) { info("%s: batch job cannot use X11 forwarding", __func__); return ESLURM_X11_NOT_AVAIL; } if (job_desc_msg->user_id == NO_VAL) { info("%s: job failed to specify User", __func__); return ESLURM_USER_ID_MISSING; } if ( job_desc_msg->group_id == NO_VAL ) { debug("%s: job failed to specify group", __func__); return ESLURM_GROUP_ID_MISSING; } if (job_desc_msg->contiguous == NO_VAL16) job_desc_msg->contiguous = 0; if (job_desc_msg->task_dist == NO_VAL) { /* not typically set by salloc or sbatch */ job_desc_msg->task_dist = SLURM_DIST_CYCLIC; } if (job_desc_msg->plane_size == NO_VAL16) job_desc_msg->plane_size = 0; if (job_desc_msg->kill_on_node_fail == NO_VAL16) job_desc_msg->kill_on_node_fail = 1; if (job_desc_msg->job_id != NO_VAL) { job_record_t *dup_job_ptr; if (!fed_mgr_fed_rec && (submit_uid != 0) && (submit_uid != slurmctld_conf.slurm_user_id)) { info("attempt by uid %u to set JobId=%u", submit_uid, job_desc_msg->job_id); return ESLURM_INVALID_JOB_ID; } if (job_desc_msg->job_id == 0) { info("attempt by uid %u to set JobId=0", submit_uid); return ESLURM_INVALID_JOB_ID; } dup_job_ptr = find_job_record(job_desc_msg->job_id); if (dup_job_ptr) { info("attempt to re-use active %pJ", dup_job_ptr); return ESLURM_DUPLICATE_JOB_ID; } } if (job_desc_msg->nice == NO_VAL) job_desc_msg->nice = NICE_OFFSET; if (job_desc_msg->pn_min_memory == NO_VAL64) { /* Default memory limit is DefMemPerCPU (if set) or no limit */ if (part_ptr && part_ptr->def_mem_per_cpu) { job_desc_msg->pn_min_memory = part_ptr->def_mem_per_cpu; } else { job_desc_msg->pn_min_memory = slurmctld_conf.def_mem_per_cpu; } } else if (!_validate_min_mem_partition(job_desc_msg, part_ptr, part_list)) { return ESLURM_INVALID_TASK_MEMORY; } else { /* Memory limit explicity set by user */ job_desc_msg->bitflags |= JOB_MEM_SET; } if (job_desc_msg->pn_min_memory == MEM_PER_CPU) { /* Map --mem-per-cpu=0 to --mem=0 for simpler logic */ job_desc_msg->pn_min_memory = 0; } /* Validate a job's accounting frequency, if specified */ if (acct_gather_check_acct_freq_task( job_desc_msg->pn_min_memory, job_desc_msg->acctg_freq)) return ESLURMD_INVALID_ACCT_FREQ; if (job_desc_msg->min_nodes == NO_VAL) job_desc_msg->min_nodes = 1; /* default node count of 1 */ if (job_desc_msg->min_cpus == NO_VAL) job_desc_msg->min_cpus = job_desc_msg->min_nodes; if ((job_desc_msg->pn_min_cpus == NO_VAL16) || (job_desc_msg->pn_min_cpus == 0)) job_desc_msg->pn_min_cpus = 1; /* default 1 cpu per node */ if (job_desc_msg->pn_min_tmp_disk == NO_VAL) job_desc_msg->pn_min_tmp_disk = 0;/* default 0MB disk per node */ return SLURM_SUCCESS; } /* * Traverse the list of partitions and invoke the * function validating the job memory specification. */ static bool _validate_min_mem_partition(job_desc_msg_t *job_desc_msg, part_record_t *part_ptr, List part_list) { ListIterator iter; part_record_t *part; uint64_t tmp_pn_min_memory; uint16_t tmp_cpus_per_task; uint32_t tmp_min_cpus; uint32_t tmp_max_cpus; uint32_t tmp_pn_min_cpus; bool cc = false; /* no reason to check them here as we aren't enforcing them */ if (!slurmctld_conf.enforce_part_limits) return true; tmp_pn_min_memory = job_desc_msg->pn_min_memory; tmp_cpus_per_task = job_desc_msg->cpus_per_task; tmp_min_cpus = job_desc_msg->min_cpus; tmp_max_cpus = job_desc_msg->max_cpus; tmp_pn_min_cpus = job_desc_msg->pn_min_cpus; if (part_list == NULL) { cc = _valid_pn_min_mem(job_desc_msg, part_ptr); } else { iter = list_iterator_create(part_list); while ((part = list_next(iter))) { cc = _valid_pn_min_mem(job_desc_msg, part); /* for ALL we have to test them all */ if (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ALL) { if (!cc) break; } else if (cc) /* break, we found one! */ break; else if (slurmctld_conf.enforce_part_limits == PARTITION_ENFORCE_ANY) { debug("%s: Job requested for (%"PRIu64")MB is invalid" " for partition %s", __func__, job_desc_msg->pn_min_memory, part->name); } job_desc_msg->pn_min_memory = tmp_pn_min_memory; job_desc_msg->cpus_per_task = tmp_cpus_per_task; job_desc_msg->min_cpus = tmp_min_cpus; job_desc_msg->max_cpus = tmp_max_cpus; job_desc_msg->pn_min_cpus = tmp_pn_min_cpus; } list_iterator_destroy(iter); } /* * Restoring original values, if it is necessary, * these will be modified in job_limits_check() */ job_desc_msg->pn_min_memory = tmp_pn_min_memory; job_desc_msg->cpus_per_task = tmp_cpus_per_task; job_desc_msg->min_cpus = tmp_min_cpus; job_desc_msg->max_cpus = tmp_max_cpus; job_desc_msg->pn_min_cpus = tmp_pn_min_cpus; return cc; } extern void free_null_array_recs(job_record_t *job_ptr) { if (!job_ptr || !job_ptr->array_recs) return; FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); xfree(job_ptr->array_recs->task_id_str); xfree(job_ptr->array_recs); } static void _delete_job_common(job_record_t *job_ptr) { /* Remove record from fed_job_list */ fed_mgr_remove_fed_job_info(job_ptr->job_id); /* Remove the record from job hash table */ _remove_job_hash(job_ptr, JOB_HASH_JOB); /* Remove the record from job array hash tables, if applicable */ if (job_ptr->array_task_id != NO_VAL) { _remove_job_hash(job_ptr, JOB_HASH_ARRAY_JOB); _remove_job_hash(job_ptr, JOB_HASH_ARRAY_TASK); } } /* * _list_delete_job - delete a job record and its corresponding job_details, * see common/list.h for documentation * IN job_entry - pointer to job_record to delete */ static void _list_delete_job(void *job_entry) { job_record_t *job_ptr = (job_record_t *) job_entry; int job_array_size, i; xassert(job_entry); xassert (job_ptr->magic == JOB_MAGIC); job_ptr->magic = 0; /* make sure we don't delete record twice */ _delete_job_common(job_ptr); if (job_ptr->array_recs) { job_array_size = MAX(1, job_ptr->array_recs->task_cnt); } else { job_array_size = 1; } _delete_job_details(job_ptr); xfree(job_ptr->account); xfree(job_ptr->admin_comment); xfree(job_ptr->alias_list); xfree(job_ptr->alloc_node); free_null_array_recs(job_ptr); if (job_ptr->array_recs) { FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); xfree(job_ptr->array_recs->task_id_str); xfree(job_ptr->array_recs); } xfree(job_ptr->batch_features); xfree(job_ptr->batch_host); xfree(job_ptr->burst_buffer); xfree(job_ptr->comment); xfree(job_ptr->clusters); xfree(job_ptr->cpus_per_tres); free_job_fed_details(&job_ptr->fed_details); free_job_resources(&job_ptr->job_resrcs); xfree(job_ptr->gres_alloc); _clear_job_gres_details(job_ptr); xfree(job_ptr->gres_req); xfree(job_ptr->gres_used); FREE_NULL_LIST(job_ptr->gres_list); xfree(job_ptr->licenses); FREE_NULL_LIST(job_ptr->license_list); xfree(job_ptr->limit_set.tres); xfree(job_ptr->mail_user); xfree(job_ptr->mcs_label); xfree(job_ptr->mem_per_tres); xfree(job_ptr->name); xfree(job_ptr->network); xfree(job_ptr->node_addr); FREE_NULL_BITMAP(job_ptr->node_bitmap); FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); xfree(job_ptr->nodes); xfree(job_ptr->nodes_completing); xfree(job_ptr->origin_cluster); if (job_ptr->het_details && job_ptr->het_job_id) { /* xfree struct if hetjob leader and NULL ptr otherwise. */ if (job_ptr->het_job_offset == 0) xfree(job_ptr->het_details); else job_ptr->het_details = NULL; } xfree(job_ptr->het_job_id_set); FREE_NULL_LIST(job_ptr->het_job_list); xfree(job_ptr->partition); FREE_NULL_LIST(job_ptr->part_ptr_list); xfree(job_ptr->priority_array); slurm_destroy_priority_factors_object(job_ptr->prio_factors); xfree(job_ptr->resp_host); xfree(job_ptr->resv_name); xfree(job_ptr->sched_nodes); for (i = 0; i < job_ptr->spank_job_env_size; i++) xfree(job_ptr->spank_job_env[i]); xfree(job_ptr->spank_job_env); xfree(job_ptr->state_desc); xfree(job_ptr->system_comment); xfree(job_ptr->tres_alloc_cnt); xfree(job_ptr->tres_alloc_str); xfree(job_ptr->tres_bind); xfree(job_ptr->tres_freq); xfree(job_ptr->tres_fmt_alloc_str); xfree(job_ptr->tres_per_job); xfree(job_ptr->tres_per_node); xfree(job_ptr->tres_per_socket); xfree(job_ptr->tres_per_task); xfree(job_ptr->tres_req_cnt); xfree(job_ptr->tres_req_str); xfree(job_ptr->tres_fmt_req_str); step_list_purge(job_ptr); select_g_select_jobinfo_free(job_ptr->select_jobinfo); xfree(job_ptr->user_name); xfree(job_ptr->wckey); if (job_array_size > job_count) { error("job_count underflow"); job_count = 0; } else { job_count -= job_array_size; } job_ptr->job_id = 0; xfree(job_ptr); } /* * find specific job_id entry in the job list, key is job_id_ptr */ static int _list_find_job_id(void *job_entry, void *key) { job_record_t *job_ptr = (job_record_t *) job_entry; uint32_t *job_id_ptr = (uint32_t *) key; if (job_ptr->job_id == *job_id_ptr) return 1; return 0; } /* * _list_find_job_old - find old entries in the job list, * see common/list.h for documentation, key is ignored * job_entry IN - job pointer * key IN - if not NULL, then skip hetjobs */ static int _list_find_job_old(void *job_entry, void *key) { time_t kill_age, min_age, now = time(NULL); job_record_t *job_ptr = (job_record_t *) job_entry; uint16_t cleaning = 0; if ((job_ptr->job_id == NO_VAL) && IS_JOB_REVOKED(job_ptr)) return 1; if (key && job_ptr->het_job_id) return 0; if (IS_JOB_COMPLETING(job_ptr) && !LOTS_OF_AGENTS) { kill_age = now - (slurmctld_conf.kill_wait + 2 * slurm_get_msg_timeout()); if (job_ptr->time_last_active < kill_age) { job_ptr->time_last_active = now; re_kill_job(job_ptr); } return 0; /* Job still completing */ } if (job_ptr->epilog_running) return 0; /* EpilogSlurmctld still running */ if (slurmctld_conf.min_job_age == 0) return 0; /* No job record purging */ if (fed_mgr_fed_rec && job_ptr->fed_details && !fed_mgr_is_origin_job(job_ptr)) { uint32_t origin_id = fed_mgr_get_cluster_id(job_ptr->job_id); slurmdb_cluster_rec_t *origin = fed_mgr_get_cluster_by_id(origin_id); /* keep job around until origin comes back and is synced */ if (origin && (!origin->fed.send || (((slurm_persist_conn_t *)origin->fed.send)->fd == -1) || !origin->fed.sync_sent)) return 0; } min_age = now - slurmctld_conf.min_job_age; if (job_ptr->end_time > min_age) return 0; /* Too new to purge */ if (!(IS_JOB_COMPLETED(job_ptr))) return 0; /* Job still active */ if (job_ptr->step_list && list_count(job_ptr->step_list)) { debug("%pJ still has %d active steps", job_ptr, list_count(job_ptr->step_list)); /* * If the job has been around more than 30 days the steps are * bogus. Blow the job away. This was witnessed <= 16.05 but * hasn't be seen since. This is here just to clear them out if * this ever shows up again. */ min_age = now - PURGE_OLD_JOB_IN_SEC; if (job_ptr->end_time <= min_age) { info("Force purge of %pJ. It ended over 30 days ago, the slurmctld thinks there are still steps running but they are most likely bogus. In any case you might want to check nodes %s to make sure nothing remains of the job.", job_ptr, job_ptr->nodes); goto end_it; } else return 0; /* steps are still active */ } if (job_ptr->array_recs) { if (job_ptr->array_recs->tot_run_tasks || !_test_job_array_purged(job_ptr->array_job_id)) { /* Some tasks from this job array still active */ return 0; } } select_g_select_jobinfo_get(job_ptr->select_jobinfo, SELECT_JOBDATA_CLEANING, &cleaning); if (cleaning) return 0; /* Job hasn't finished yet */ if (bb_g_job_test_stage_out(job_ptr) != 1) return 0; /* Stage out in progress */ /* If we don't have a db_index by now and we are running with * the slurmdbd, lets put it on the list to be handled later * when slurmdbd comes back up since we won't get another chance. * job_start won't pend for job_db_inx when the job is finished. */ end_it: if (with_slurmdbd && !job_ptr->db_index) jobacct_storage_g_job_start(acct_db_conn, job_ptr); return 1; /* Purge the job */ } /* Determine if ALL partitions associated with a job are hidden */ static bool _all_parts_hidden(job_record_t *job_ptr, uid_t uid) { bool rc; ListIterator part_iterator; part_record_t *part_ptr; if (job_ptr->part_ptr_list) { rc = true; part_iterator = list_iterator_create(job_ptr->part_ptr_list); while ((part_ptr = list_next(part_iterator))) { if (part_is_visible(part_ptr, uid)) { rc = false; break; } } list_iterator_destroy(part_iterator); return rc; } if (job_ptr->part_ptr && part_is_visible(job_ptr->part_ptr, uid)) return false; return true; } /* Determine if a given job should be seen by a specific user */ static bool _hide_job(job_record_t *job_ptr, uid_t uid, uint16_t show_flags) { if (!(show_flags & SHOW_ALL) && IS_JOB_REVOKED(job_ptr)) return true; if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) && (job_ptr->user_id != uid) && !validate_operator(uid) && (((slurm_mcs_get_privatedata() == 0) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) || ((slurm_mcs_get_privatedata() == 1) && (mcs_g_check_mcs_label(uid, job_ptr->mcs_label) != 0)))) return true; return false; } static void _pack_job(job_record_t *job_ptr, _foreach_pack_job_info_t *pack_info) { xassert (job_ptr->magic == JOB_MAGIC); if ((pack_info->filter_uid != NO_VAL) && (pack_info->filter_uid != job_ptr->user_id)) return; if (((pack_info->show_flags & SHOW_ALL) == 0) && (pack_info->uid != 0) && _all_parts_hidden(job_ptr, pack_info->uid)) return; if (_hide_job(job_ptr, pack_info->uid, pack_info->show_flags)) return; pack_job(job_ptr, pack_info->show_flags, pack_info->buffer, pack_info->protocol_version, pack_info->uid); (*pack_info->jobs_packed)++; } static int _foreach_pack_jobid(void *object, void *arg) { job_record_t *job_ptr; uint32_t job_id = *(uint32_t *)object; _foreach_pack_job_info_t *info = (_foreach_pack_job_info_t *)arg; if (!(job_ptr = find_job_record(job_id))) return SLURM_SUCCESS; _pack_job(job_ptr, info); return SLURM_SUCCESS; } /* * pack_all_jobs - dump all job information for all jobs in * machine independent form (for network transmission) * OUT buffer_ptr - the pointer is set to the allocated buffer. * OUT buffer_size - set to size of the buffer in bytes * IN show_flags - job filtering options * IN uid - uid of user making request (for partition filtering) * IN filter_uid - pack only jobs belonging to this user if not NO_VAL * global: job_list - global list of job records * NOTE: the buffer at *buffer_ptr must be xfreed by the caller * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c * whenever the data format changes */ extern void pack_all_jobs(char **buffer_ptr, int *buffer_size, uint16_t show_flags, uid_t uid, uint32_t filter_uid, uint16_t protocol_version) { uint32_t jobs_packed = 0, tmp_offset; _foreach_pack_job_info_t pack_info = {0}; Buf buffer; ListIterator itr; job_record_t *job_ptr = NULL; buffer_ptr[0] = NULL; *buffer_size = 0; buffer = init_buf(BUF_SIZE); /* write message body header : size and time */ /* put in a place holder job record count of 0 for now */ pack32(jobs_packed, buffer); pack_time(time(NULL), buffer); /* write individual job records */ pack_info.buffer = buffer; pack_info.filter_uid = filter_uid; pack_info.jobs_packed = &jobs_packed; pack_info.protocol_version = protocol_version; pack_info.show_flags = show_flags; pack_info.uid = uid; itr = list_iterator_create(job_list); while ((job_ptr = list_next(itr))) { _pack_job(job_ptr, &pack_info); } list_iterator_destroy(itr); /* put the real record count in the message body header */ tmp_offset = get_buf_offset(buffer); set_buf_offset(buffer, 0); pack32(jobs_packed, buffer); set_buf_offset(buffer, tmp_offset); *buffer_size = get_buf_offset(buffer); buffer_ptr[0] = xfer_buf_data(buffer); } /* * pack_spec_jobs - dump job information for specified jobs in * machine independent form (for network transmission) * OUT buffer_ptr - the pointer is set to the allocated buffer. * OUT buffer_size - set to size of the buffer in bytes * IN show_flags - job filtering options * IN job_ids - list of job_ids to pack * IN uid - uid of user making request (for partition filtering) * IN filter_uid - pack only jobs belonging to this user if not NO_VAL * global: job_list - global list of job records * NOTE: the buffer at *buffer_ptr must be xfreed by the caller * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c * whenever the data format changes */ extern void pack_spec_jobs(char **buffer_ptr, int *buffer_size, List job_ids, uint16_t show_flags, uid_t uid, uint32_t filter_uid, uint16_t protocol_version) { uint32_t jobs_packed = 0, tmp_offset; _foreach_pack_job_info_t pack_info = {0}; Buf buffer; xassert(job_ids); buffer_ptr[0] = NULL; *buffer_size = 0; buffer = init_buf(BUF_SIZE); /* write message body header : size and time */ /* put in a place holder job record count of 0 for now */ pack32(jobs_packed, buffer); pack_time(time(NULL), buffer); /* write individual job records */ pack_info.buffer = buffer; pack_info.filter_uid = filter_uid; pack_info.jobs_packed = &jobs_packed; pack_info.protocol_version = protocol_version; pack_info.show_flags = show_flags; pack_info.uid = uid; list_for_each(job_ids, _foreach_pack_jobid, &pack_info); /* put the real record count in the message body header */ tmp_offset = get_buf_offset(buffer); set_buf_offset(buffer, 0); pack32(jobs_packed, buffer); set_buf_offset(buffer, tmp_offset); *buffer_size = get_buf_offset(buffer); buffer_ptr[0] = xfer_buf_data(buffer); } static int _pack_het_job(job_record_t *job_ptr, uint16_t show_flags, Buf buffer, uint16_t protocol_version, uid_t uid) { job_record_t *het_job_ptr; int job_cnt = 0; ListIterator iter; iter = list_iterator_create(job_ptr->het_job_list); while ((het_job_ptr = list_next(iter))) { if (het_job_ptr->het_job_id == job_ptr->het_job_id) { pack_job(het_job_ptr, show_flags, buffer, protocol_version, uid); job_cnt++; } else { error("%s: Bad het_job_list for %pJ", __func__, job_ptr); } } list_iterator_destroy(iter); return job_cnt; } /* * pack_one_job - dump information for one jobs in * machine independent form (for network transmission) * OUT buffer_ptr - the pointer is set to the allocated buffer. * OUT buffer_size - set to size of the buffer in bytes * IN job_id - ID of job that we want info for * IN show_flags - job filtering options * IN uid - uid of user making request (for partition filtering) * NOTE: the buffer at *buffer_ptr must be xfreed by the caller * NOTE: change _unpack_job_desc_msg() in common/slurm_protocol_pack.c * whenever the data format changes */ extern int pack_one_job(char **buffer_ptr, int *buffer_size, uint32_t job_id, uint16_t show_flags, uid_t uid, uint16_t protocol_version) { job_record_t *job_ptr; uint32_t jobs_packed = 0, tmp_offset; Buf buffer; buffer_ptr[0] = NULL; *buffer_size = 0; buffer = init_buf(BUF_SIZE); /* write message body header : size and time */ /* put in a place holder job record count of 0 for now */ pack32(jobs_packed, buffer); pack_time(time(NULL), buffer); job_ptr = find_job_record(job_id); if (job_ptr && job_ptr->het_job_list) { /* Pack heterogeneous job components */ if (!_hide_job(job_ptr, uid, show_flags)) { jobs_packed = _pack_het_job(job_ptr, show_flags, buffer, protocol_version, uid); } } else if (job_ptr && (job_ptr->array_task_id == NO_VAL) && !job_ptr->array_recs) { /* Pack regular (not array) job */ if (!_hide_job(job_ptr, uid, show_flags)) { pack_job(job_ptr, show_flags, buffer, protocol_version, uid); jobs_packed++; } } else { bool packed_head = false; /* Either the job is not found or it is a job array */ if (job_ptr) { packed_head = true; if (!_hide_job(job_ptr, uid, show_flags)) { pack_job(job_ptr, show_flags, buffer, protocol_version, uid); jobs_packed++; } } job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; while (job_ptr) { if ((job_ptr->job_id == job_id) && packed_head) { ; /* Already packed */ } else if (job_ptr->array_job_id == job_id) { if (_hide_job(job_ptr, uid, show_flags)) break; pack_job(job_ptr, show_flags, buffer, protocol_version, uid); jobs_packed++; } job_ptr = job_ptr->job_array_next_j; } } if (jobs_packed == 0) { free_buf(buffer); return ESLURM_INVALID_JOB_ID; } /* put the real record count in the message body header */ tmp_offset = get_buf_offset(buffer); set_buf_offset(buffer, 0); pack32(jobs_packed, buffer); set_buf_offset(buffer, tmp_offset); *buffer_size = get_buf_offset(buffer); buffer_ptr[0] = xfer_buf_data(buffer); return SLURM_SUCCESS; } static void _pack_job_gres(job_record_t *dump_job_ptr, Buf buffer, uint16_t protocol_version) { if (!IS_JOB_STARTED(dump_job_ptr) || IS_JOB_FINISHED(dump_job_ptr) || (dump_job_ptr->gres_list == NULL)) { packstr_array(NULL, 0, buffer); return; } packstr_array(dump_job_ptr->gres_detail_str, dump_job_ptr->gres_detail_cnt, buffer); } /* * pack_job - dump all configuration information about a specific job in * machine independent form (for network transmission) * IN dump_job_ptr - pointer to job for which information is requested * IN show_flags - job filtering options * IN/OUT buffer - buffer in which data is placed, pointers automatically * updated * IN uid - user requesting the data * NOTE: change _unpack_job_info_members() in common/slurm_protocol_pack.c * whenever the data format changes */ void pack_job(job_record_t *dump_job_ptr, uint16_t show_flags, Buf buffer, uint16_t protocol_version, uid_t uid) { struct job_details *detail_ptr; time_t accrue_time = 0, begin_time = 0, start_time = 0, end_time = 0; uint32_t time_limit; char *nodelist = NULL; assoc_mgr_lock_t locks = { .qos = READ_LOCK }; if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) { detail_ptr = dump_job_ptr->details; pack32(dump_job_ptr->array_job_id, buffer); pack32(dump_job_ptr->array_task_id, buffer); if (dump_job_ptr->array_recs) { build_array_str(dump_job_ptr); packstr(dump_job_ptr->array_recs->task_id_str, buffer); pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); } else { job_record_t *array_head = NULL; packnull(buffer); if (dump_job_ptr->array_job_id) { array_head = find_job_record( dump_job_ptr->array_job_id); } if (array_head && array_head->array_recs) { pack32(array_head->array_recs->max_run_tasks, buffer); } else { pack32((uint32_t) 0, buffer); } } pack32(dump_job_ptr->assoc_id, buffer); pack32(dump_job_ptr->delay_boot, buffer); pack32(dump_job_ptr->job_id, buffer); pack32(dump_job_ptr->user_id, buffer); pack32(dump_job_ptr->group_id, buffer); pack32(dump_job_ptr->het_job_id, buffer); packstr(dump_job_ptr->het_job_id_set, buffer); pack32(dump_job_ptr->het_job_offset, buffer); pack32(dump_job_ptr->profile, buffer); pack32(dump_job_ptr->job_state, buffer); pack16(dump_job_ptr->batch_flag, buffer); pack16(dump_job_ptr->state_reason, buffer); pack8(dump_job_ptr->power_flags, buffer); pack8(dump_job_ptr->reboot, buffer); pack16(dump_job_ptr->restart_cnt, buffer); pack16(show_flags, buffer); pack_time(dump_job_ptr->deadline, buffer); pack32(dump_job_ptr->alloc_sid, buffer); if ((dump_job_ptr->time_limit == NO_VAL) && dump_job_ptr->part_ptr) time_limit = dump_job_ptr->part_ptr->max_time; else time_limit = dump_job_ptr->time_limit; pack32(time_limit, buffer); pack32(dump_job_ptr->time_min, buffer); if (dump_job_ptr->details) { pack32(dump_job_ptr->details->nice, buffer); pack_time(dump_job_ptr->details->submit_time, buffer); /* Earliest possible begin time */ begin_time = dump_job_ptr->details->begin_time; /* When we started accruing time for priority */ accrue_time = dump_job_ptr->details->accrue_time; } else { /* Some job details may be purged after completion */ pack32(NICE_OFFSET, buffer); /* Best guess */ pack_time((time_t) 0, buffer); } pack_time(begin_time, buffer); pack_time(accrue_time, buffer); if (IS_JOB_STARTED(dump_job_ptr)) { /* Report actual start time, in past */ start_time = dump_job_ptr->start_time; end_time = dump_job_ptr->end_time; } else if (dump_job_ptr->start_time != 0) { /* Report expected start time, * making sure that time is not in the past */ start_time = MAX(dump_job_ptr->start_time, time(NULL)); if (time_limit != NO_VAL) { end_time = MAX(dump_job_ptr->end_time, (start_time + time_limit * 60)); } } else if (begin_time > time(NULL)) { /* earliest start time in the future */ start_time = begin_time; if (time_limit != NO_VAL) { end_time = MAX(dump_job_ptr->end_time, (start_time + time_limit * 60)); } } pack_time(start_time, buffer); pack_time(end_time, buffer); pack_time(dump_job_ptr->suspend_time, buffer); pack_time(dump_job_ptr->pre_sus_time, buffer); pack_time(dump_job_ptr->resize_time, buffer); pack_time(dump_job_ptr->last_sched_eval, buffer); pack_time(dump_job_ptr->preempt_time, buffer); pack32(dump_job_ptr->priority, buffer); packdouble(dump_job_ptr->billable_tres, buffer); packstr(slurmctld_conf.cluster_name, buffer); /* Only send the allocated nodelist since we are only sending * the number of cpus and nodes that are currently allocated. */ if (!IS_JOB_COMPLETING(dump_job_ptr)) packstr(dump_job_ptr->nodes, buffer); else { nodelist = bitmap2node_name(dump_job_ptr->node_bitmap_cg); packstr(nodelist, buffer); xfree(nodelist); } packstr(dump_job_ptr->sched_nodes, buffer); if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) packstr(dump_job_ptr->part_ptr->name, buffer); else packstr(dump_job_ptr->partition, buffer); packstr(dump_job_ptr->account, buffer); packstr(dump_job_ptr->admin_comment, buffer); pack32(dump_job_ptr->site_factor, buffer); packstr(dump_job_ptr->network, buffer); packstr(dump_job_ptr->comment, buffer); packstr(dump_job_ptr->batch_features, buffer); packstr(dump_job_ptr->batch_host, buffer); packstr(dump_job_ptr->burst_buffer, buffer); packstr(dump_job_ptr->burst_buffer_state, buffer); packstr(dump_job_ptr->system_comment, buffer); assoc_mgr_lock(&locks); if (dump_job_ptr->qos_ptr) packstr(dump_job_ptr->qos_ptr->name, buffer); else { if (assoc_mgr_qos_list) { packstr(slurmdb_qos_str(assoc_mgr_qos_list, dump_job_ptr->qos_id), buffer); } else packnull(buffer); } if (IS_JOB_STARTED(dump_job_ptr) && (slurmctld_conf.preempt_mode != PREEMPT_MODE_OFF) && (slurm_job_preempt_mode(dump_job_ptr) != PREEMPT_MODE_OFF)) { time_t preemptable = acct_policy_get_preemptable_time( dump_job_ptr); pack_time(preemptable, buffer); } else { pack_time(0, buffer); } assoc_mgr_unlock(&locks); packstr(dump_job_ptr->licenses, buffer); packstr(dump_job_ptr->state_desc, buffer); packstr(dump_job_ptr->resv_name, buffer); packstr(dump_job_ptr->mcs_label, buffer); pack32(dump_job_ptr->exit_code, buffer); pack32(dump_job_ptr->derived_ec, buffer); packstr(dump_job_ptr->gres_used, buffer); if (show_flags & SHOW_DETAIL) { pack_job_resources(dump_job_ptr->job_resrcs, buffer, protocol_version); _pack_job_gres(dump_job_ptr, buffer, protocol_version); } else { pack32(NO_VAL, buffer); pack32((uint32_t) 0, buffer); } packstr(dump_job_ptr->name, buffer); packstr(dump_job_ptr->user_name, buffer); packstr(dump_job_ptr->wckey, buffer); pack32(dump_job_ptr->req_switch, buffer); pack32(dump_job_ptr->wait4switch, buffer); packstr(dump_job_ptr->alloc_node, buffer); if (!IS_JOB_COMPLETING(dump_job_ptr)) pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer); else pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer); select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, buffer, protocol_version); /* A few details are always dumped here */ _pack_default_job_details(dump_job_ptr, buffer, protocol_version); /* other job details are only dumped until the job starts * running (at which time they become meaningless) */ if (detail_ptr) _pack_pending_job_details(detail_ptr, buffer, protocol_version); else _pack_pending_job_details(NULL, buffer, protocol_version); pack32(dump_job_ptr->bit_flags, buffer); packstr(dump_job_ptr->tres_fmt_alloc_str, buffer); packstr(dump_job_ptr->tres_fmt_req_str, buffer); pack16(dump_job_ptr->start_protocol_ver, buffer); if (dump_job_ptr->fed_details) { packstr(dump_job_ptr->fed_details->origin_str, buffer); pack64(dump_job_ptr->fed_details->siblings_active, buffer); packstr(dump_job_ptr->fed_details->siblings_active_str, buffer); pack64(dump_job_ptr->fed_details->siblings_viable, buffer); packstr(dump_job_ptr->fed_details->siblings_viable_str, buffer); } else { packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); } packstr(dump_job_ptr->cpus_per_tres, buffer); packstr(dump_job_ptr->mem_per_tres, buffer); packstr(dump_job_ptr->tres_bind, buffer); packstr(dump_job_ptr->tres_freq, buffer); packstr(dump_job_ptr->tres_per_job, buffer); packstr(dump_job_ptr->tres_per_node, buffer); packstr(dump_job_ptr->tres_per_socket, buffer); packstr(dump_job_ptr->tres_per_task, buffer); pack16(dump_job_ptr->mail_type, buffer); packstr(dump_job_ptr->mail_user, buffer); } else if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) { detail_ptr = dump_job_ptr->details; pack32(dump_job_ptr->array_job_id, buffer); pack32(dump_job_ptr->array_task_id, buffer); if (dump_job_ptr->array_recs) { build_array_str(dump_job_ptr); packstr(dump_job_ptr->array_recs->task_id_str, buffer); pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); } else { job_record_t *array_head = NULL; packnull(buffer); if (dump_job_ptr->array_job_id) { array_head = find_job_record( dump_job_ptr->array_job_id); } if (array_head && array_head->array_recs) { pack32(array_head->array_recs->max_run_tasks, buffer); } else { pack32((uint32_t) 0, buffer); } } pack32(dump_job_ptr->assoc_id, buffer); pack32(dump_job_ptr->delay_boot, buffer); pack32(dump_job_ptr->job_id, buffer); pack32(dump_job_ptr->user_id, buffer); pack32(dump_job_ptr->group_id, buffer); pack32(dump_job_ptr->het_job_id, buffer); packstr(dump_job_ptr->het_job_id_set, buffer); pack32(dump_job_ptr->het_job_offset, buffer); pack32(dump_job_ptr->profile, buffer); pack32(dump_job_ptr->job_state, buffer); pack16(dump_job_ptr->batch_flag, buffer); pack16(dump_job_ptr->state_reason, buffer); pack8(dump_job_ptr->power_flags, buffer); pack8(dump_job_ptr->reboot, buffer); pack16(dump_job_ptr->restart_cnt, buffer); pack16(show_flags, buffer); pack_time(dump_job_ptr->deadline, buffer); pack32(dump_job_ptr->alloc_sid, buffer); if ((dump_job_ptr->time_limit == NO_VAL) && dump_job_ptr->part_ptr) time_limit = dump_job_ptr->part_ptr->max_time; else time_limit = dump_job_ptr->time_limit; pack32(time_limit, buffer); pack32(dump_job_ptr->time_min, buffer); if (dump_job_ptr->details) { pack32(dump_job_ptr->details->nice, buffer); pack_time(dump_job_ptr->details->submit_time, buffer); /* Earliest possible begin time */ begin_time = dump_job_ptr->details->begin_time; /* When we started accruing time for priority */ accrue_time = dump_job_ptr->details->accrue_time; } else { /* Some job details may be purged after completion */ pack32(NICE_OFFSET, buffer); /* Best guess */ pack_time((time_t) 0, buffer); } pack_time(begin_time, buffer); pack_time(accrue_time, buffer); if (IS_JOB_STARTED(dump_job_ptr)) { /* Report actual start time, in past */ start_time = dump_job_ptr->start_time; end_time = dump_job_ptr->end_time; } else if (dump_job_ptr->start_time != 0) { /* Report expected start time, * making sure that time is not in the past */ start_time = MAX(dump_job_ptr->start_time, time(NULL)); if (time_limit != NO_VAL) { end_time = MAX(dump_job_ptr->end_time, (start_time + time_limit * 60)); } } else if (begin_time > time(NULL)) { /* earliest start time in the future */ start_time = begin_time; if (time_limit != NO_VAL) { end_time = MAX(dump_job_ptr->end_time, (start_time + time_limit * 60)); } } pack_time(start_time, buffer); pack_time(end_time, buffer); pack_time(dump_job_ptr->suspend_time, buffer); pack_time(dump_job_ptr->pre_sus_time, buffer); pack_time(dump_job_ptr->resize_time, buffer); pack_time(dump_job_ptr->last_sched_eval, buffer); pack_time(dump_job_ptr->preempt_time, buffer); pack32(dump_job_ptr->priority, buffer); packdouble(dump_job_ptr->billable_tres, buffer); packstr(slurmctld_conf.cluster_name, buffer); /* Only send the allocated nodelist since we are only sending * the number of cpus and nodes that are currently allocated. */ if (!IS_JOB_COMPLETING(dump_job_ptr)) packstr(dump_job_ptr->nodes, buffer); else { nodelist = bitmap2node_name(dump_job_ptr->node_bitmap_cg); packstr(nodelist, buffer); xfree(nodelist); } packstr(dump_job_ptr->sched_nodes, buffer); if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) packstr(dump_job_ptr->part_ptr->name, buffer); else packstr(dump_job_ptr->partition, buffer); packstr(dump_job_ptr->account, buffer); packstr(dump_job_ptr->admin_comment, buffer); pack32(dump_job_ptr->site_factor, buffer); packstr(dump_job_ptr->network, buffer); packstr(dump_job_ptr->comment, buffer); packstr(dump_job_ptr->batch_features, buffer); packstr(dump_job_ptr->batch_host, buffer); packstr(dump_job_ptr->burst_buffer, buffer); packstr(dump_job_ptr->burst_buffer_state, buffer); packstr(dump_job_ptr->system_comment, buffer); assoc_mgr_lock(&locks); if (dump_job_ptr->qos_ptr) packstr(dump_job_ptr->qos_ptr->name, buffer); else { if (assoc_mgr_qos_list) { packstr(slurmdb_qos_str(assoc_mgr_qos_list, dump_job_ptr->qos_id), buffer); } else packnull(buffer); } if (IS_JOB_STARTED(dump_job_ptr) && (slurmctld_conf.preempt_mode != PREEMPT_MODE_OFF) && (slurm_job_preempt_mode(dump_job_ptr) != PREEMPT_MODE_OFF)) { time_t preemptable = acct_policy_get_preemptable_time( dump_job_ptr); pack_time(preemptable, buffer); } else { pack_time(0, buffer); } assoc_mgr_unlock(&locks); packstr(dump_job_ptr->licenses, buffer); packstr(dump_job_ptr->state_desc, buffer); packstr(dump_job_ptr->resv_name, buffer); packstr(dump_job_ptr->mcs_label, buffer); pack32(dump_job_ptr->exit_code, buffer); pack32(dump_job_ptr->derived_ec, buffer); if (show_flags & SHOW_DETAIL) { pack_job_resources(dump_job_ptr->job_resrcs, buffer, protocol_version); _pack_job_gres(dump_job_ptr, buffer, protocol_version); } else { pack32(NO_VAL, buffer); pack32((uint32_t) 0, buffer); } packstr(dump_job_ptr->name, buffer); packstr(dump_job_ptr->user_name, buffer); packstr(dump_job_ptr->wckey, buffer); pack32(dump_job_ptr->req_switch, buffer); pack32(dump_job_ptr->wait4switch, buffer); packstr(dump_job_ptr->alloc_node, buffer); if (!IS_JOB_COMPLETING(dump_job_ptr)) pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer); else pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer); select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, buffer, protocol_version); /* A few details are always dumped here */ _pack_default_job_details(dump_job_ptr, buffer, protocol_version); /* other job details are only dumped until the job starts * running (at which time they become meaningless) */ if (detail_ptr) _pack_pending_job_details(detail_ptr, buffer, protocol_version); else _pack_pending_job_details(NULL, buffer, protocol_version); pack32(dump_job_ptr->bit_flags, buffer); packstr(dump_job_ptr->tres_fmt_alloc_str, buffer); packstr(dump_job_ptr->tres_fmt_req_str, buffer); pack16(dump_job_ptr->start_protocol_ver, buffer); if (dump_job_ptr->fed_details) { packstr(dump_job_ptr->fed_details->origin_str, buffer); pack64(dump_job_ptr->fed_details->siblings_active, buffer); packstr(dump_job_ptr->fed_details->siblings_active_str, buffer); pack64(dump_job_ptr->fed_details->siblings_viable, buffer); packstr(dump_job_ptr->fed_details->siblings_viable_str, buffer); } else { packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); } packstr(dump_job_ptr->cpus_per_tres, buffer); packstr(dump_job_ptr->mem_per_tres, buffer); packstr(dump_job_ptr->tres_bind, buffer); packstr(dump_job_ptr->tres_freq, buffer); packstr(dump_job_ptr->tres_per_job, buffer); packstr(dump_job_ptr->tres_per_node, buffer); packstr(dump_job_ptr->tres_per_socket, buffer); packstr(dump_job_ptr->tres_per_task, buffer); } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { detail_ptr = dump_job_ptr->details; pack32(dump_job_ptr->array_job_id, buffer); pack32(dump_job_ptr->array_task_id, buffer); if (dump_job_ptr->array_recs) { build_array_str(dump_job_ptr); packstr(dump_job_ptr->array_recs->task_id_str, buffer); pack32(dump_job_ptr->array_recs->max_run_tasks, buffer); } else { packnull(buffer); pack32((uint32_t) 0, buffer); } pack32(dump_job_ptr->assoc_id, buffer); pack32(dump_job_ptr->delay_boot, buffer); pack32(dump_job_ptr->job_id, buffer); pack32(dump_job_ptr->user_id, buffer); pack32(dump_job_ptr->group_id, buffer); pack32(dump_job_ptr->het_job_id, buffer); packstr(dump_job_ptr->het_job_id_set, buffer); pack32(dump_job_ptr->het_job_offset, buffer); pack32(dump_job_ptr->profile, buffer); pack32(dump_job_ptr->job_state, buffer); pack16(dump_job_ptr->batch_flag, buffer); pack16(dump_job_ptr->state_reason, buffer); pack8(dump_job_ptr->power_flags, buffer); pack8(dump_job_ptr->reboot, buffer); pack16(dump_job_ptr->restart_cnt, buffer); pack16(show_flags, buffer); pack_time(dump_job_ptr->deadline, buffer); pack32(dump_job_ptr->alloc_sid, buffer); if ((dump_job_ptr->time_limit == NO_VAL) && dump_job_ptr->part_ptr) time_limit = dump_job_ptr->part_ptr->max_time; else time_limit = dump_job_ptr->time_limit; pack32(time_limit, buffer); pack32(dump_job_ptr->time_min, buffer); if (dump_job_ptr->details) { pack32(dump_job_ptr->details->nice, buffer); pack_time(dump_job_ptr->details->submit_time, buffer); /* Earliest possible begin time */ begin_time = dump_job_ptr->details->begin_time; /* When we started accruing time for priority */ accrue_time = dump_job_ptr->details->accrue_time; } else { /* Some job details may be purged after completion */ pack32(NICE_OFFSET, buffer); /* Best guess */ pack_time((time_t) 0, buffer); } pack_time(begin_time, buffer); pack_time(accrue_time, buffer); if (IS_JOB_STARTED(dump_job_ptr)) { /* Report actual start time, in past */ start_time = dump_job_ptr->start_time; end_time = dump_job_ptr->end_time; } else if (dump_job_ptr->start_time != 0) { /* Report expected start time, * making sure that time is not in the past */ start_time = MAX(dump_job_ptr->start_time, time(NULL)); if (time_limit != NO_VAL) { end_time = MAX(dump_job_ptr->end_time, (start_time + time_limit * 60)); } } else if (begin_time > time(NULL)) { /* earliest start time in the future */ start_time = begin_time; if (time_limit != NO_VAL) { end_time = MAX(dump_job_ptr->end_time, (start_time + time_limit * 60)); } } pack_time(start_time, buffer); pack_time(end_time, buffer); pack_time(dump_job_ptr->suspend_time, buffer); pack_time(dump_job_ptr->pre_sus_time, buffer); pack_time(dump_job_ptr->resize_time, buffer); pack_time(dump_job_ptr->last_sched_eval, buffer); pack_time(dump_job_ptr->preempt_time, buffer); pack32(dump_job_ptr->priority, buffer); packdouble(dump_job_ptr->billable_tres, buffer); packstr(slurmctld_conf.cluster_name, buffer); /* Only send the allocated nodelist since we are only sending * the number of cpus and nodes that are currently allocated. */ if (!IS_JOB_COMPLETING(dump_job_ptr)) packstr(dump_job_ptr->nodes, buffer); else { nodelist = bitmap2node_name(dump_job_ptr->node_bitmap_cg); packstr(nodelist, buffer); xfree(nodelist); } packstr(dump_job_ptr->sched_nodes, buffer); if (!IS_JOB_PENDING(dump_job_ptr) && dump_job_ptr->part_ptr) packstr(dump_job_ptr->part_ptr->name, buffer); else packstr(dump_job_ptr->partition, buffer); packstr(dump_job_ptr->account, buffer); packstr(dump_job_ptr->admin_comment, buffer); packstr(dump_job_ptr->network, buffer); packstr(dump_job_ptr->comment, buffer); packstr(dump_job_ptr->batch_features, buffer); packstr(dump_job_ptr->batch_host, buffer); packstr(dump_job_ptr->burst_buffer, buffer); packstr(dump_job_ptr->burst_buffer_state, buffer); packstr(dump_job_ptr->system_comment, buffer); assoc_mgr_lock(&locks); if (dump_job_ptr->qos_ptr) packstr(dump_job_ptr->qos_ptr->name, buffer); else { if (assoc_mgr_qos_list) { packstr(slurmdb_qos_str(assoc_mgr_qos_list, dump_job_ptr->qos_id), buffer); } else packnull(buffer); } assoc_mgr_unlock(&locks); packstr(dump_job_ptr->licenses, buffer); packstr(dump_job_ptr->state_desc, buffer); packstr(dump_job_ptr->resv_name, buffer); packstr(dump_job_ptr->mcs_label, buffer); pack32(dump_job_ptr->exit_code, buffer); pack32(dump_job_ptr->derived_ec, buffer); if (show_flags & SHOW_DETAIL) { pack_job_resources(dump_job_ptr->job_resrcs, buffer, protocol_version); _pack_job_gres(dump_job_ptr, buffer, protocol_version); } else { pack32(NO_VAL, buffer); pack32((uint32_t) 0, buffer); } packstr(dump_job_ptr->name, buffer); packstr(dump_job_ptr->user_name, buffer); packstr(dump_job_ptr->wckey, buffer); pack32(dump_job_ptr->req_switch, buffer); pack32(dump_job_ptr->wait4switch, buffer); packstr(dump_job_ptr->alloc_node, buffer); if (!IS_JOB_COMPLETING(dump_job_ptr)) pack_bit_str_hex(dump_job_ptr->node_bitmap, buffer); else pack_bit_str_hex(dump_job_ptr->node_bitmap_cg, buffer); select_g_select_jobinfo_pack(dump_job_ptr->select_jobinfo, buffer, protocol_version); /* A few details are always dumped here */ _pack_default_job_details(dump_job_ptr, buffer, protocol_version); /* other job details are only dumped until the job starts * running (at which time they become meaningless) */ if (detail_ptr) _pack_pending_job_details(detail_ptr, buffer, protocol_version); else _pack_pending_job_details(NULL, buffer, protocol_version); pack32(dump_job_ptr->bit_flags, buffer); packstr(dump_job_ptr->tres_fmt_alloc_str, buffer); packstr(dump_job_ptr->tres_fmt_req_str, buffer); pack16(dump_job_ptr->start_protocol_ver, buffer); if (dump_job_ptr->fed_details) { packstr(dump_job_ptr->fed_details->origin_str, buffer); pack64(dump_job_ptr->fed_details->siblings_active, buffer); packstr(dump_job_ptr->fed_details->siblings_active_str, buffer); pack64(dump_job_ptr->fed_details->siblings_viable, buffer); packstr(dump_job_ptr->fed_details->siblings_viable_str, buffer); } else { packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); pack64((uint64_t)0, buffer); packnull(buffer); } packstr(dump_job_ptr->cpus_per_tres, buffer); packstr(dump_job_ptr->mem_per_tres, buffer); packstr(dump_job_ptr->tres_bind, buffer); packstr(dump_job_ptr->tres_freq, buffer); packstr(dump_job_ptr->tres_per_job, buffer); packstr(dump_job_ptr->tres_per_node, buffer); packstr(dump_job_ptr->tres_per_socket, buffer); packstr(dump_job_ptr->tres_per_task, buffer); } else { error("pack_job: protocol_version " "%hu not supported", protocol_version); } } static void _find_node_config(int *cpu_cnt_ptr, int *core_cnt_ptr) { static int max_cpu_cnt = -1, max_core_cnt = -1; int i; node_record_t *node_ptr = node_record_table_ptr; *cpu_cnt_ptr = max_cpu_cnt; *core_cnt_ptr = max_core_cnt; if (max_cpu_cnt != -1) return; for (i = 0; i < node_record_count; i++, node_ptr++) { /* Only data from config_record used for scheduling */ max_cpu_cnt = MAX(max_cpu_cnt, node_ptr->config_ptr->cpus); max_core_cnt = MAX(max_core_cnt, node_ptr->config_ptr->cores); } } /* pack default job details for "get_job_info" RPC */ static void _pack_default_job_details(job_record_t *job_ptr, Buf buffer, uint16_t protocol_version) { int max_cpu_cnt = -1, max_core_cnt = -1; int i; struct job_details *detail_ptr = job_ptr->details; uint16_t shared = 0; if (!detail_ptr) shared = NO_VAL16; else if (detail_ptr->share_res == 1) /* User --share */ shared = 1; else if ((detail_ptr->share_res == 0) || (detail_ptr->whole_node == 1)) shared = 0; /* User --exclusive */ else if (detail_ptr->whole_node == WHOLE_NODE_USER) shared = JOB_SHARED_USER; /* User --exclusive=user */ else if (detail_ptr->whole_node == WHOLE_NODE_MCS) shared = JOB_SHARED_MCS; /* User --exclusive=mcs */ else if (job_ptr->part_ptr) { /* Report shared status based upon latest partition info */ if (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER) shared = JOB_SHARED_USER; else if ((job_ptr->part_ptr->max_share & SHARED_FORCE) && ((job_ptr->part_ptr->max_share & (~SHARED_FORCE)) > 1)) shared = 1; /* Partition Shared=force */ else if (job_ptr->part_ptr->max_share == 0) shared = 0; /* Partition Shared=exclusive */ else shared = NO_VAL16; /* Part Shared=yes or no */ } else shared = NO_VAL16; /* No user or partition info */ if (job_ptr->part_ptr && job_ptr->part_ptr->max_cpu_cnt) { max_cpu_cnt = job_ptr->part_ptr->max_cpu_cnt; max_core_cnt = job_ptr->part_ptr->max_core_cnt; } else _find_node_config(&max_cpu_cnt, &max_core_cnt); if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { if (detail_ptr) { packstr(detail_ptr->features, buffer); packstr(detail_ptr->cluster_features, buffer); packstr(detail_ptr->work_dir, buffer); packstr(detail_ptr->dependency, buffer); if (detail_ptr->argv) { char *cmd_line = NULL, *pos = NULL; for (i = 0; detail_ptr->argv[i]; i++) { xstrfmtcatat(cmd_line, &pos, "%s%s", (i ? " " : ""), detail_ptr->argv[i]); } packstr(cmd_line, buffer); xfree(cmd_line); } else packnull(buffer); if (IS_JOB_COMPLETING(job_ptr) && job_ptr->cpu_cnt) { pack32(job_ptr->cpu_cnt, buffer); pack32((uint32_t) 0, buffer); } else if (job_ptr->total_cpus && !IS_JOB_PENDING(job_ptr)) { /* If job is PENDING ignore total_cpus, * which may have been set by previous run * followed by job requeue. */ pack32(job_ptr->total_cpus, buffer); pack32((uint32_t) 0, buffer); } else { pack32(detail_ptr->min_cpus, buffer); if (detail_ptr->max_cpus != NO_VAL) pack32(detail_ptr->max_cpus, buffer); else pack32((uint32_t) 0, buffer); } if (IS_JOB_COMPLETING(job_ptr) && job_ptr->node_cnt) { pack32(job_ptr->node_cnt, buffer); pack32((uint32_t) 0, buffer); } else if (job_ptr->total_nodes) { pack32(job_ptr->total_nodes, buffer); pack32((uint32_t) 0, buffer); } else if (job_ptr->node_cnt_wag) { /* This should catch everything else, but * just in case this is 0 (startup or * whatever) we will keep the rest of * this if statement around. */ pack32(job_ptr->node_cnt_wag, buffer); pack32((uint32_t) detail_ptr->max_nodes, buffer); } else if (detail_ptr->ntasks_per_node) { /* min_nodes based upon task count and ntasks * per node */ uint32_t min_nodes; min_nodes = detail_ptr->num_tasks / detail_ptr->ntasks_per_node; min_nodes = MAX(min_nodes, detail_ptr->min_nodes); pack32(min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); } else if (detail_ptr->cpus_per_task > 1) { /* min_nodes based upon task count and cpus * per task */ uint32_t ntasks_per_node, min_nodes; ntasks_per_node = max_cpu_cnt / detail_ptr->cpus_per_task; ntasks_per_node = MAX(ntasks_per_node, 1); min_nodes = detail_ptr->num_tasks / ntasks_per_node; min_nodes = MAX(min_nodes, detail_ptr->min_nodes); if (detail_ptr->num_tasks % ntasks_per_node) min_nodes++; pack32(min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); } else if (detail_ptr->mc_ptr && detail_ptr->mc_ptr->ntasks_per_core && (detail_ptr->mc_ptr->ntasks_per_core != INFINITE16)) { /* min_nodes based upon task count and ntasks * per core */ uint32_t min_cores, min_nodes; min_cores = detail_ptr->num_tasks + detail_ptr->mc_ptr->ntasks_per_core - 1; min_cores /= detail_ptr->mc_ptr->ntasks_per_core; min_nodes = min_cores + max_core_cnt - 1; min_nodes /= max_core_cnt; min_nodes = MAX(min_nodes, detail_ptr->min_nodes); pack32(min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); } else { /* min_nodes based upon task count only */ uint32_t min_nodes; min_nodes = detail_ptr->num_tasks + max_cpu_cnt - 1; min_nodes /= max_cpu_cnt; min_nodes = MAX(min_nodes, detail_ptr->min_nodes); pack32(min_nodes, buffer); pack32(detail_ptr->max_nodes, buffer); } pack16(detail_ptr->requeue, buffer); pack16(detail_ptr->ntasks_per_node, buffer); if (detail_ptr->num_tasks) pack32(detail_ptr->num_tasks, buffer); else if (IS_JOB_PENDING(job_ptr)) pack32(detail_ptr->min_nodes, buffer); else pack32(job_ptr->node_cnt, buffer); pack16(shared, buffer); pack32(detail_ptr->cpu_freq_min, buffer); pack32(detail_ptr->cpu_freq_max, buffer); pack32(detail_ptr->cpu_freq_gov, buffer); } else { packnull(buffer); packnull(buffer); packnull(buffer); packnull(buffer); if (job_ptr->total_cpus) pack32(job_ptr->total_cpus, buffer); else pack32(job_ptr->cpu_cnt, buffer); pack32((uint32_t) 0, buffer); pack32(job_ptr->node_cnt, buffer); pack32((uint32_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); pack32((uint32_t) 0, buffer); } } else { error("_pack_default_job_details: protocol_version " "%hu not supported", protocol_version); } } /* pack pending job details for "get_job_info" RPC */ static void _pack_pending_job_details(struct job_details *detail_ptr, Buf buffer, uint16_t protocol_version) { if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { if (detail_ptr) { pack16(detail_ptr->contiguous, buffer); pack16(detail_ptr->core_spec, buffer); pack16(detail_ptr->cpus_per_task, buffer); pack16(detail_ptr->pn_min_cpus, buffer); pack64(detail_ptr->pn_min_memory, buffer); pack32(detail_ptr->pn_min_tmp_disk, buffer); packstr(detail_ptr->req_nodes, buffer); pack_bit_str_hex(detail_ptr->req_node_bitmap, buffer); packstr(detail_ptr->exc_nodes, buffer); pack_bit_str_hex(detail_ptr->exc_node_bitmap, buffer); packstr(detail_ptr->std_err, buffer); packstr(detail_ptr->std_in, buffer); packstr(detail_ptr->std_out, buffer); pack_multi_core_data(detail_ptr->mc_ptr, buffer, protocol_version); } else { pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack16((uint16_t) 0, buffer); pack64((uint64_t) 0, buffer); pack32((uint32_t) 0, buffer); packnull(buffer); packnull(buffer); packnull(buffer); packnull(buffer); packnull(buffer); packnull(buffer); packnull(buffer); pack_multi_core_data(NULL, buffer, protocol_version); } } else { error("%s: protocol_version %hu not supported", __func__, protocol_version); } } static int _purge_het_job_filter(void *x, void *key) { job_record_t *job_ptr = (job_record_t *) x; job_record_t *job_filter = (job_record_t *) key; if (job_ptr->het_job_id == job_filter->het_job_id) return 1; return 0; } /* If this is a hetjob leader and all components are complete, * then purge all job of its hetjob records * RET true if this record purged */ static inline bool _purge_complete_het_job(job_record_t *het_job_leader) { job_record_t purge_job_rec; job_record_t *het_job; ListIterator iter; bool incomplete_job = false; int i; if (!het_job_leader->het_job_list) return false; /* Not hetjob leader */ if (!IS_JOB_FINISHED(het_job_leader)) return false; /* Hetjob leader incomplete */ iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) { if (het_job_leader->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, het_job_leader); continue; } if (!_list_find_job_old(het_job, NULL)) { incomplete_job = true; break; } } list_iterator_destroy(iter); if (incomplete_job) return false; purge_job_rec.het_job_id = het_job_leader->het_job_id; i = list_delete_all(job_list, &_purge_het_job_filter, &purge_job_rec); if (i) { debug2("%s: purged %d old job records", __func__, i); last_job_update = time(NULL); slurm_mutex_lock(&purge_thread_lock); slurm_cond_signal(&purge_thread_cond); slurm_mutex_unlock(&purge_thread_lock); } return true; } /* * If the job or slurm.conf requests to not kill on invalid dependency, * then set the job state reason to WAIT_DEP_INVALID. Otherwise, kill the * job. */ void handle_invalid_dependency(job_record_t *job_ptr) { job_ptr->state_reason = WAIT_DEP_INVALID; xfree(job_ptr->state_desc); if (job_ptr->bit_flags & KILL_INV_DEP) { _kill_dependent(job_ptr); } else if (job_ptr->bit_flags & NO_KILL_INV_DEP) { debug("%s: %pJ job dependency never satisfied", __func__, job_ptr); } else if (kill_invalid_dep) { _kill_dependent(job_ptr); } else { debug("%s: %pJ job dependency never satisfied", __func__, job_ptr); job_ptr->state_reason = WAIT_DEP_INVALID; } fed_mgr_remove_remote_dependencies(job_ptr); } /* * purge_old_job - purge old job records. * The jobs must have completed at least MIN_JOB_AGE minutes ago. * Test job dependencies, handle after_ok, after_not_ok before * purging any jobs. */ void purge_old_job(void) { ListIterator job_iterator; job_record_t *job_ptr; int i, purge_job_count; xassert(verify_lock(CONF_LOCK, READ_LOCK)); xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); xassert(verify_lock(NODE_LOCK, WRITE_LOCK)); xassert(verify_lock(FED_LOCK, READ_LOCK)); if ((purge_job_count = list_count(purge_files_list))) debug("%s: job file deletion is falling behind, " "%d left to remove", __func__, purge_job_count); job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if (_purge_complete_het_job(job_ptr)) continue; if (!IS_JOB_PENDING(job_ptr)) continue; /* * If the dependency is already invalid there's no reason to * keep checking it. */ if (job_ptr->state_reason == WAIT_DEP_INVALID) continue; if (test_job_dependency(job_ptr, NULL) == FAIL_DEPEND) { /* Check what are the job disposition * to deal with invalid dependecies */ handle_invalid_dependency(job_ptr); } } list_iterator_destroy(job_iterator); fed_mgr_test_remote_dependencies(); i = list_delete_all(job_list, &_list_find_job_old, ""); if (i) { debug2("purge_old_job: purged %d old job records", i); last_job_update = time(NULL); slurm_mutex_lock(&purge_thread_lock); slurm_cond_signal(&purge_thread_cond); slurm_mutex_unlock(&purge_thread_lock); } } /* * purge_job_record - purge specific job record. No testing is performed to * ensure the job records has no active references. Use only for job * records that were never fully operational (e.g. WILL_RUN test, failed * job load, failed job create, etc.). * IN job_id - job_id of job record to be purged * RET int - count of job's purged * global: job_list - global job table */ extern int purge_job_record(uint32_t job_id) { int count = 0; count = list_delete_all(job_list, _list_find_job_id, (void *)&job_id); if (count) { last_job_update = time(NULL); slurm_mutex_lock(&purge_thread_lock); slurm_cond_signal(&purge_thread_cond); slurm_mutex_unlock(&purge_thread_lock); } return count; } extern void unlink_job_record(job_record_t *job_ptr) { uint32_t *job_id; xassert(job_ptr->magic == JOB_MAGIC); _delete_job_common(job_ptr); job_id = xmalloc(sizeof(uint32_t)); *job_id = job_ptr->job_id; list_enqueue(purge_files_list, job_id); job_ptr->job_id = NO_VAL; last_job_update = time(NULL); slurm_mutex_lock(&purge_thread_lock); slurm_cond_signal(&purge_thread_cond); slurm_mutex_unlock(&purge_thread_lock); } /* * reset_job_bitmaps - reestablish bitmaps for existing jobs. * this should be called after rebuilding node information, * but before using any job entries. * global: last_job_update - time of last job table update * job_list - pointer to global job list */ void reset_job_bitmaps(void) { ListIterator job_iterator; job_record_t *job_ptr; part_record_t *part_ptr; List part_ptr_list = NULL; bool job_fail = false; time_t now = time(NULL); bool gang_flag = false; static uint32_t cr_flag = NO_VAL; xassert(job_list); if (cr_flag == NO_VAL) { cr_flag = 0; /* call is no-op for select/linear and others */ if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL, &cr_flag)) { cr_flag = NO_VAL; /* error */ } } if (slurmctld_conf.preempt_mode & PREEMPT_MODE_GANG) gang_flag = true; job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { xassert (job_ptr->magic == JOB_MAGIC); job_fail = false; if (job_ptr->partition == NULL) { error("No partition for %pJ", job_ptr); part_ptr = NULL; job_fail = true; } else { char *err_part = NULL; part_ptr = find_part_record(job_ptr->partition); if (part_ptr == NULL) { part_ptr_list = get_part_list( job_ptr->partition, &err_part); if (part_ptr_list) { part_ptr = list_peek(part_ptr_list); if (list_count(part_ptr_list) == 1) FREE_NULL_LIST(part_ptr_list); } } if (part_ptr == NULL) { error("Invalid partition (%s) for %pJ", err_part, job_ptr); xfree(err_part); job_fail = true; } } job_ptr->part_ptr = part_ptr; FREE_NULL_LIST(job_ptr->part_ptr_list); if (part_ptr_list) { job_ptr->part_ptr_list = part_ptr_list; part_ptr_list = NULL; /* clear for next job */ } FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); if (job_ptr->nodes_completing && node_name2bitmap(job_ptr->nodes_completing, false, &job_ptr->node_bitmap_cg)) { error("Invalid nodes (%s) for %pJ", job_ptr->nodes_completing, job_ptr); job_fail = true; } FREE_NULL_BITMAP(job_ptr->node_bitmap); if (job_ptr->nodes && node_name2bitmap(job_ptr->nodes, false, &job_ptr->node_bitmap) && !job_fail) { error("Invalid nodes (%s) for %pJ", job_ptr->nodes, job_ptr); job_fail = true; } if (reset_node_bitmap(job_ptr)) job_fail = true; if (!job_fail && !IS_JOB_FINISHED(job_ptr) && job_ptr->job_resrcs && (cr_flag || gang_flag) && valid_job_resources(job_ptr->job_resrcs, node_record_table_ptr)) { error("Aborting %pJ due to change in socket/core configuration of allocated nodes", job_ptr); job_fail = true; } if (!job_fail && !IS_JOB_FINISHED(job_ptr) && gres_plugin_job_revalidate(job_ptr->gres_list)) { error("Aborting %pJ due to use of unsupported GRES options", job_ptr); job_fail = true; } if (!job_fail && job_ptr->job_resrcs && (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) && gres_plugin_job_revalidate2(job_ptr->job_id, job_ptr->gres_list, job_ptr->job_resrcs->node_bitmap)) { /* * This can be due to the job being allocated GRES * which no longer exist (i.e. the GRES count on some * allocated node changed since when the job started). */ error("Aborting %pJ due to use of invalid GRES configuration", job_ptr); job_fail = true; } _reset_step_bitmaps(job_ptr); /* Do not increase the job->node_cnt for completed jobs */ if (! IS_JOB_COMPLETED(job_ptr)) build_node_details(job_ptr, false); /* set node_addr */ if (_reset_detail_bitmaps(job_ptr)) job_fail = true; if (job_fail) { if (IS_JOB_PENDING(job_ptr)) { job_ptr->start_time = job_ptr->end_time = time(NULL); job_ptr->job_state = JOB_NODE_FAIL; } else if (IS_JOB_RUNNING(job_ptr)) { job_ptr->end_time = time(NULL); job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; build_cg_bitmap(job_ptr); } else if (IS_JOB_SUSPENDED(job_ptr)) { job_ptr->end_time = job_ptr->suspend_time; job_ptr->job_state = JOB_NODE_FAIL | JOB_COMPLETING; build_cg_bitmap(job_ptr); job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); } job_ptr->state_reason = FAIL_DOWN_NODE; xfree(job_ptr->state_desc); job_completion_logger(job_ptr, false); if (job_ptr->job_state == JOB_NODE_FAIL) { /* build_cg_bitmap() may clear JOB_COMPLETING */ epilog_slurmctld(job_ptr); } } } list_iterator_reset(job_iterator); /* This will reinitialize the select plugin database, which * we can only do after ALL job's states and bitmaps are set * (i.e. it needs to be in this second loop) */ while ((job_ptr = list_next(job_iterator))) { if (select_g_select_nodeinfo_set(job_ptr) != SLURM_SUCCESS) { error("select_g_select_nodeinfo_set(%pJ): %m", job_ptr); } } list_iterator_destroy(job_iterator); last_job_update = now; } static int _reset_detail_bitmaps(job_record_t *job_ptr) { if (job_ptr->details == NULL) return SLURM_SUCCESS; FREE_NULL_BITMAP(job_ptr->details->req_node_bitmap); if ((job_ptr->details->req_nodes) && (node_name2bitmap(job_ptr->details->req_nodes, false, &job_ptr->details->req_node_bitmap))) { error("Invalid req_nodes (%s) for %pJ", job_ptr->details->req_nodes, job_ptr); return SLURM_ERROR; } FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); if ((job_ptr->details->exc_nodes) && (node_name2bitmap(job_ptr->details->exc_nodes, true, &job_ptr->details->exc_node_bitmap))) { error("Invalid exc_nodes (%s) for %pJ", job_ptr->details->exc_nodes, job_ptr); return SLURM_ERROR; } return SLURM_SUCCESS; } static void _reset_step_bitmaps(job_record_t *job_ptr) { ListIterator step_iterator; step_record_t *step_ptr; step_iterator = list_iterator_create (job_ptr->step_list); while ((step_ptr = list_next(step_iterator))) { if (step_ptr->state < JOB_RUNNING) continue; FREE_NULL_BITMAP(step_ptr->step_node_bitmap); if (step_ptr->step_layout && step_ptr->step_layout->node_list && (node_name2bitmap(step_ptr->step_layout->node_list, false, &step_ptr->step_node_bitmap))) { error("Invalid step_node_list (%s) for %pS", step_ptr->step_layout->node_list, step_ptr); delete_step_record (job_ptr, step_ptr->step_id); } if (step_ptr->step_node_bitmap == NULL) { error("Missing node_list for %pS", step_ptr); delete_step_record (job_ptr, step_ptr->step_id); } } list_iterator_destroy (step_iterator); return; } /* update first assigned job id as needed on reconfigure */ void reset_first_job_id(void) { xassert(verify_lock(CONF_LOCK, READ_LOCK)); job_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); } /* * Return the next available job_id to be used. * * IN test_only - if true, doesn't advance the job_id sequence, just returns * what the next job id will be. * RET a valid job_id or SLURM_ERROR if all job_ids are exhausted. */ extern uint32_t get_next_job_id(bool test_only) { int i; uint32_t new_id, max_jobs, tmp_id_sequence; xassert(verify_lock(JOB_LOCK, READ_LOCK)); xassert(test_only || verify_lock(JOB_LOCK, WRITE_LOCK)); xassert(verify_lock(FED_LOCK, READ_LOCK)); max_jobs = slurmctld_conf.max_job_id - slurmctld_conf.first_job_id; tmp_id_sequence = MAX(job_id_sequence, slurmctld_conf.first_job_id); /* Ensure no conflict in job id if we roll over 32 bits */ for (i = 0; i < max_jobs; i++) { if (++tmp_id_sequence >= slurmctld_conf.max_job_id) tmp_id_sequence = slurmctld_conf.first_job_id; new_id = fed_mgr_get_job_id(tmp_id_sequence); if (find_job_record(new_id)) continue; if (_dup_job_file_test(new_id)) continue; if (!test_only) job_id_sequence = tmp_id_sequence; return new_id; } error("We have exhausted our supply of valid job id values. " "FirstJobId=%u MaxJobId=%u", slurmctld_conf.first_job_id, slurmctld_conf.max_job_id); return SLURM_ERROR; } /* * _set_job_id - set a default job_id, ensure that it is unique * IN job_ptr - pointer to the job_record */ static int _set_job_id(job_record_t *job_ptr) { uint32_t new_id; xassert(job_ptr); xassert (job_ptr->magic == JOB_MAGIC); if ((new_id = get_next_job_id(false)) != SLURM_ERROR) { job_ptr->job_id = new_id; /* When we get a new job id might as well make sure * the db_index is 0 since there is no way it will be * correct otherwise :). */ job_ptr->db_index = 0; return SLURM_SUCCESS; } job_ptr->job_id = NO_VAL; return EAGAIN; } /* * set_job_prio - set a default job priority * IN job_ptr - pointer to the job_record */ extern void set_job_prio(job_record_t *job_ptr) { uint32_t relative_prio; xassert(job_ptr); xassert (job_ptr->magic == JOB_MAGIC); if (IS_JOB_FINISHED(job_ptr)) return; job_ptr->priority = slurm_sched_g_initial_priority(lowest_prio, job_ptr); if ((job_ptr->priority == 0) || (job_ptr->direct_set_prio)) return; relative_prio = job_ptr->priority; if (job_ptr->details && (job_ptr->details->nice != NICE_OFFSET)) { int64_t offset = job_ptr->details->nice; offset -= NICE_OFFSET; relative_prio += offset; } lowest_prio = MIN(relative_prio, lowest_prio); } /* After recovering job state, if using priority/basic then we increment the * priorities of all jobs to avoid decrementing the base down to zero */ extern void sync_job_priorities(void) { ListIterator job_iterator; job_record_t *job_ptr; uint32_t prio_boost = 0; if ((highest_prio != 0) && (highest_prio < TOP_PRIORITY)) prio_boost = TOP_PRIORITY - highest_prio; if (xstrcmp(slurmctld_conf.priority_type, "priority/basic") || (prio_boost < 1000000)) return; job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if ((job_ptr->priority) && (job_ptr->direct_set_prio == 0)) job_ptr->priority += prio_boost; } list_iterator_destroy(job_iterator); lowest_prio += prio_boost; } /* * _top_priority - determine if any other job has a higher priority than the * specified job * IN job_ptr - pointer to selected job * RET true if selected job has highest priority */ static bool _top_priority(job_record_t *job_ptr, uint32_t het_job_offset) { struct job_details *detail_ptr = job_ptr->details; time_t now = time(NULL); int pend_time; bool top; if (job_ptr->priority == 0) /* user held */ top = false; else { ListIterator job_iterator; job_record_t *job_ptr2; top = true; /* assume top priority until found otherwise */ job_iterator = list_iterator_create(job_list); while ((job_ptr2 = list_next(job_iterator))) { if (job_ptr2 == job_ptr) continue; if ((het_job_offset != NO_VAL) && (job_ptr->job_id == (job_ptr2->job_id + het_job_offset))) continue; if (!IS_JOB_PENDING(job_ptr2)) continue; if (IS_JOB_COMPLETING(job_ptr2)) { /* Job is hung in pending & completing state, * indicative of job requeue */ continue; } if (bf_min_age_reserve) { if (job_ptr2->details->begin_time == 0) continue; pend_time = difftime(now, job_ptr2-> details->begin_time); if (pend_time < bf_min_age_reserve) continue; } if (!acct_policy_job_runnable_state(job_ptr2) || !misc_policy_job_runnable_state(job_ptr2) || !part_policy_job_runnable_state(job_ptr2) || !job_independent(job_ptr2)) continue; if (!xstrcmp(job_ptr2->resv_name, job_ptr->resv_name) || (job_ptr2->resv_ptr && (job_ptr->warn_time <= job_ptr2->resv_ptr->max_start_delay) && (job_ptr->warn_flags & KILL_JOB_RESV))) { /* same reservation */ if (job_ptr2->priority <= job_ptr->priority) continue; top = false; break; } else if ((job_ptr2->resv_name && (!job_ptr->resv_name)) || ((!job_ptr2->resv_name) && job_ptr->resv_name)) continue; /* different reservation */ if (bb_g_job_test_stage_in(job_ptr2, true) != 1) continue; /* Waiting for buffer */ if (job_ptr2->part_ptr == job_ptr->part_ptr) { /* same partition */ if (job_ptr2->priority <= job_ptr->priority) continue; top = false; break; } if (bit_overlap_any(job_ptr->part_ptr->node_bitmap, job_ptr2->part_ptr->node_bitmap) == 0) continue; /* no node overlap in partitions */ if ((job_ptr2->part_ptr->priority_tier > job_ptr ->part_ptr->priority_tier) || ((job_ptr2->part_ptr->priority_tier == job_ptr ->part_ptr->priority_tier) && (job_ptr2->priority > job_ptr->priority))) { top = false; break; } } list_iterator_destroy(job_iterator); } if ((!top) && detail_ptr) { /* not top prio */ if (job_ptr->priority == 0) { /* user/admin hold */ if (job_ptr->state_reason != FAIL_BAD_CONSTRAINTS && (job_ptr->state_reason != WAIT_RESV_DELETED) && (job_ptr->state_reason != FAIL_BURST_BUFFER_OP) && (job_ptr->state_reason != FAIL_ACCOUNT) && (job_ptr->state_reason != FAIL_QOS) && (job_ptr->state_reason != WAIT_HELD) && (job_ptr->state_reason != WAIT_HELD_USER) && job_ptr->state_reason != WAIT_MAX_REQUEUE) { job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); } } else if (job_ptr->state_reason == WAIT_NO_REASON && het_job_offset == NO_VAL) { job_ptr->state_reason = WAIT_PRIORITY; xfree(job_ptr->state_desc); } } return top; } static void _merge_job_licenses(job_record_t *shrink_job_ptr, job_record_t *expand_job_ptr) { xassert(shrink_job_ptr); xassert(expand_job_ptr); /* FIXME: do we really need to update accounting here? It * might already happen */ if (!shrink_job_ptr->licenses) /* No licenses to add */ return; if (!expand_job_ptr->licenses) { /* Just transfer licenses */ expand_job_ptr->licenses = shrink_job_ptr->licenses; shrink_job_ptr->licenses = NULL; FREE_NULL_LIST(expand_job_ptr->license_list); expand_job_ptr->license_list = shrink_job_ptr->license_list; shrink_job_ptr->license_list = NULL; return; } /* Merge the license information into expanding job */ xstrcat(expand_job_ptr->licenses, ","); xstrcat(expand_job_ptr->licenses, shrink_job_ptr->licenses); xfree(shrink_job_ptr->licenses); FREE_NULL_LIST(expand_job_ptr->license_list); FREE_NULL_LIST(shrink_job_ptr->license_list); license_job_merge(expand_job_ptr); return; } static void _hold_job_rec(job_record_t *job_ptr, uid_t uid) { int i, j; job_ptr->direct_set_prio = 1; job_ptr->priority = 0; if (IS_JOB_PENDING(job_ptr)) acct_policy_remove_accrue_time(job_ptr, false); if (job_ptr->part_ptr_list && job_ptr->priority_array) { j = list_count(job_ptr->part_ptr_list); for (i = 0; i < j; i++) { job_ptr->priority_array[i] = 0; } } sched_info("%s: hold on %pJ by uid %u", __func__, job_ptr, uid); } static void _hold_job(job_record_t *job_ptr, uid_t uid) { job_record_t *het_job_leader = NULL, *het_job; ListIterator iter; if (job_ptr->het_job_id && _get_whole_hetjob()) het_job_leader = find_job_record(job_ptr->het_job_id); if (het_job_leader && het_job_leader->het_job_list) { iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) _hold_job_rec(het_job, uid); list_iterator_destroy(iter); return; } _hold_job_rec(job_ptr, uid); } static void _release_job_rec(job_record_t *job_ptr, uid_t uid) { time_t now = time(NULL); if (job_ptr->details && (job_ptr->details->begin_time < now)) job_ptr->details->begin_time = 0; job_ptr->direct_set_prio = 0; set_job_prio(job_ptr); job_ptr->state_reason = WAIT_NO_REASON; job_ptr->state_reason_prev = WAIT_NO_REASON; job_ptr->job_state &= ~JOB_SPECIAL_EXIT; xfree(job_ptr->state_desc); job_ptr->exit_code = 0; fed_mgr_job_requeue(job_ptr); /* submit sibling jobs */ sched_info("%s: release hold on %pJ by uid %u", __func__, job_ptr, uid); } static void _release_job(job_record_t *job_ptr, uid_t uid) { job_record_t *het_job_leader = NULL, *het_job; ListIterator iter; if (job_ptr->het_job_id && _get_whole_hetjob()) het_job_leader = find_job_record(job_ptr->het_job_id); if (het_job_leader && het_job_leader->het_job_list) { iter = list_iterator_create(het_job_leader->het_job_list); while ((het_job = list_next(iter))) _release_job_rec(het_job, uid); list_iterator_destroy(iter); return; } _release_job_rec(job_ptr, uid); } /* * Gets a new association giving priority to the given parameters in job_desc, * and if not possible using the job_ptr ones. * IN job_desc: The new job description to use for getting the assoc_ptr. * IN job_ptr: The original job_ptr to use when parameters are not in job_desc. * RET assoc_rec, the new association combining the most updated information * from job_desc. */ static slurmdb_assoc_rec_t *_retrieve_new_assoc(job_desc_msg_t *job_desc, job_record_t *job_ptr) { slurmdb_assoc_rec_t assoc_rec, *assoc_ptr = NULL; memset(&assoc_rec, 0, sizeof(assoc_rec)); if (job_desc->partition) { part_record_t *part_ptr = NULL; int error_code = _get_job_parts(job_desc, &part_ptr, NULL, NULL); /* We don't need this we only care about part_ptr */ if (error_code != SLURM_SUCCESS) { errno = error_code; return NULL; } else if (!(part_ptr->state_up & PARTITION_SUBMIT)) { errno = ESLURM_PARTITION_NOT_AVAIL; return NULL; } assoc_rec.partition = part_ptr->name; } else if (job_ptr->part_ptr) assoc_rec.partition = job_ptr->part_ptr->name; if (job_desc->account) assoc_rec.acct = job_desc->account; else assoc_rec.acct = job_ptr->account; assoc_rec.uid = job_ptr->user_id; if (assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, &assoc_ptr, false)) { info("%s: invalid account %s for %pJ", __func__, assoc_rec.acct, job_ptr); errno = ESLURM_INVALID_ACCOUNT; return NULL; } else if (association_based_accounting && !assoc_ptr && !(accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) && assoc_rec.acct) { /* if not enforcing associations we want to look for * the default account and use it to avoid getting * trash in the accounting records. */ assoc_rec.acct = NULL; (void) assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec, accounting_enforce, &assoc_ptr, false); } return assoc_ptr; } /* Allocate nodes to new job. Old job info will be cleared at epilog complete */ static void _realloc_nodes(job_record_t *job_ptr, bitstr_t *orig_node_bitmap) { int i, i_first, i_last; node_record_t *node_ptr; xassert(job_ptr); xassert(orig_node_bitmap); if (!job_ptr->job_resrcs || !job_ptr->job_resrcs->node_bitmap) return; i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap); if (i_first >= 0) i_last = bit_fls(job_ptr->job_resrcs->node_bitmap); else i_last = -1; for (i = i_first; i <= i_last; i++) { if (!bit_test(job_ptr->job_resrcs->node_bitmap, i) || bit_test(orig_node_bitmap, i)) continue; node_ptr = node_record_table_ptr + i; make_node_alloc(node_ptr, job_ptr); } } extern bool permit_job_expansion(void) { static time_t sched_update = 0; static bool permit_job_expansion = false; if (sched_update != slurmctld_conf.last_update) { char *sched_params = slurm_get_sched_params(); sched_update = slurmctld_conf.last_update; if (xstrcasestr(sched_params, "permit_job_expansion")) permit_job_expansion = true; else permit_job_expansion = false; xfree(sched_params); } return permit_job_expansion; } extern bool permit_job_shrink(void) { static time_t sched_update = 0; static bool permit_job_shrink = false; if (sched_update != slurmctld_conf.last_update) { char *sched_params = slurm_get_sched_params(); sched_update = slurmctld_conf.last_update; if (xstrcasestr(sched_params, "disable_job_shrink")) permit_job_shrink = false; else permit_job_shrink = true; xfree(sched_params); } return permit_job_shrink; } static int _update_job(job_record_t *job_ptr, job_desc_msg_t *job_specs, uid_t uid) { int error_code = SLURM_SUCCESS; enum job_state_reason fail_reason; bool operator = false; bool is_coord_oldacc = false, is_coord_newacc = false; uint32_t save_min_nodes = 0, save_max_nodes = 0; uint32_t save_min_cpus = 0, save_max_cpus = 0; struct job_details *detail_ptr; part_record_t *new_part_ptr = NULL, *use_part_ptr = NULL; bitstr_t *exc_bitmap = NULL, *new_req_bitmap = NULL; time_t now = time(NULL); multi_core_data_t *mc_ptr = NULL; bool update_accounting = false, new_req_bitmap_given = false; acct_policy_limit_set_t acct_policy_limit_set; uint16_t tres[slurmctld_tres_cnt]; bool acct_limit_already_exceeded; bool tres_changed = false; int tres_pos; uint64_t tres_req_cnt[slurmctld_tres_cnt]; bool tres_req_cnt_set = false; List gres_list = NULL; List license_list = NULL; List part_ptr_list = NULL; uint32_t orig_time_limit; bool gres_update = false; slurmdb_assoc_rec_t *new_assoc_ptr = NULL, *use_assoc_ptr = NULL; slurmdb_qos_rec_t *new_qos_ptr = NULL, *use_qos_ptr = NULL; slurmctld_resv_t *new_resv_ptr = NULL; uint32_t user_site_factor; assoc_mgr_lock_t locks = { .tres = READ_LOCK }; /* * This means we are in the middle of requesting the db_inx from the * database. So we can't update right now. You should try again outside * the job_write lock in a second or so. */ if (job_ptr->db_index == NO_VAL64) return ESLURM_JOB_SETTING_DB_INX; operator = validate_operator(uid); if (job_specs->burst_buffer) { /* * burst_buffer contents are validated at job submit time and * data is possibly being staged at later times. It can not * be changed except to clear the value on a completed job and * purge the record in order to recover from a failure mode */ if (IS_JOB_COMPLETED(job_ptr) && operator && (job_specs->burst_buffer[0] == '\0')) { xfree(job_ptr->burst_buffer); last_job_update = now; } else { error_code = ESLURM_NOT_SUPPORTED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->array_inx && job_ptr->array_recs) { int throttle; throttle = strtoll(job_specs->array_inx, (char **) NULL, 10); if (throttle >= 0) { info("%s: set max_run_tasks to %d for job array %pJ", __func__, throttle, job_ptr); job_ptr->array_recs->max_run_tasks = throttle; } else { info("%s: invalid max_run_tasks of %d for job array %pJ, ignored", __func__, throttle, job_ptr); error_code = ESLURM_BAD_TASK_COUNT; } /* * Even if the job is complete, permit changing * ArrayTaskThrottle for other elements of the task array */ if (IS_JOB_FINISHED(job_ptr)) goto fini; } if (IS_JOB_FINISHED(job_ptr)) { error_code = ESLURM_JOB_FINISHED; goto fini; } /* * Validate before job_submit_plugin_modify() so that the job_submit * plugin can make changes to the field without triggering an auth * issue. */ if (job_specs->admin_comment && !validate_super_user(uid)) { error("Attempt to change admin_comment for %pJ", job_ptr); error_code = ESLURM_ACCESS_DENIED; goto fini; } /* Save before submit plugin potentially modifies it. */ user_site_factor = job_specs->site_factor; if (job_specs->user_id == NO_VAL) { /* * Used by job_submit/lua to find default partition and * access control logic below to validate partition change */ job_specs->user_id = job_ptr->user_id; } error_code = job_submit_plugin_modify(job_specs, job_ptr, (uint32_t) uid); if (error_code != SLURM_SUCCESS) return error_code; error_code = node_features_g_job_valid(job_specs->features); if (error_code != SLURM_SUCCESS) return error_code; error_code = _test_job_desc_fields(job_specs); if (error_code != SLURM_SUCCESS) return error_code; memset(&acct_policy_limit_set, 0, sizeof(acct_policy_limit_set)); acct_policy_limit_set.tres = tres; if (operator) { /* set up the acct_policy if we are at least an operator */ for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) acct_policy_limit_set.tres[tres_pos] = ADMIN_SET_LIMIT; acct_policy_limit_set.time = ADMIN_SET_LIMIT; acct_policy_limit_set.qos = ADMIN_SET_LIMIT; } else memset(tres, 0, sizeof(tres)); /* Check authorization for modifying this job */ is_coord_oldacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account); is_coord_newacc = assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_specs->account); if ((job_ptr->user_id != uid) && !operator) { /* * Fail if we are not coordinators of the current account or * if we are changing an account and we are not coordinators * of both src and dest accounts. */ if (!is_coord_oldacc || (!is_coord_newacc && job_specs->account)) { error("Security violation, JOB_UPDATE RPC from uid %d", uid); return ESLURM_USER_ID_MISSING; } } detail_ptr = job_ptr->details; if (detail_ptr) mc_ptr = detail_ptr->mc_ptr; last_job_update = now; /* * Check to see if the new requested job_specs exceeds any * existing limit. If it passes, cool, we will check the new * association/qos/part later in the code and fail if it is wrong. * * If it doesn't pass this mean some limit was exceededed before the * update request so let's keep the user continue screwing up herself * with the limit if it is what she wants. We do this by not exiting * on the later call to acct_policy_validate() if it fails. * * We will also prevent the update to return an error code that is * confusing since many things could successfully update and we are now * just already violating a limit. The job won't be allowed to run, * but it will allow the update to happen which is most likely what * was desired. * * Changes in between this check and the next acct_policy_validate() * will not be constrained to accounting enforce limits. */ orig_time_limit = job_specs->time_limit; memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt)); job_specs->tres_req_cnt = tres_req_cnt; tres_req_cnt_set = true; acct_limit_already_exceeded = false; if (!operator && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) { if (!acct_policy_validate(job_specs, job_ptr->part_ptr, job_ptr->assoc_ptr, job_ptr->qos_ptr, NULL, &acct_policy_limit_set, true)) { debug("%s: already exceeded association's cpu, node, " "memory or time limit for user %u", __func__, job_specs->user_id); acct_limit_already_exceeded = true; } job_specs->time_limit = orig_time_limit; } /* * The partition, assoc, qos, reservation, and req_node_bitmap all have * to be set before checking later. So here we set them into temporary * variables set in the job way later. */ if (job_specs->partition && !xstrcmp(job_specs->partition, job_ptr->partition)) { sched_debug("%s: new partition identical to old partition %pJ", __func__, job_ptr); xfree(job_specs->partition); } else if (job_specs->partition) { if (!IS_JOB_PENDING(job_ptr)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } error_code = _get_job_parts(job_specs, &new_part_ptr, &part_ptr_list, NULL); if (error_code != SLURM_SUCCESS) ; else if ((new_part_ptr->state_up & PARTITION_SUBMIT) == 0) error_code = ESLURM_PARTITION_NOT_AVAIL; else if (!part_ptr_list && !xstrcmp(new_part_ptr->name, job_ptr->partition)) { sched_debug("%s: 2 new partition identical to old partition %pJ", __func__, job_ptr); xfree(job_specs->partition); new_part_ptr = NULL; } if (error_code != SLURM_SUCCESS) goto fini; } use_part_ptr = new_part_ptr ? new_part_ptr : job_ptr->part_ptr; /* Check the account and the partition as both affect the association */ if (job_specs->account || job_specs->partition) { if (!IS_JOB_PENDING(job_ptr)) error_code = ESLURM_JOB_NOT_PENDING; else { new_assoc_ptr = _retrieve_new_assoc(job_specs, job_ptr); if (!new_assoc_ptr) error_code = errno; else if (new_assoc_ptr == job_ptr->assoc_ptr) { new_assoc_ptr = NULL; sched_debug("%s: new association identical to old association %u", __func__, job_ptr->job_id); } /* * Clear errno that may have been set by * _retrieve_new_assoc. */ errno = 0; } if (error_code != SLURM_SUCCESS) goto fini; } use_assoc_ptr = new_assoc_ptr ? new_assoc_ptr : job_ptr->assoc_ptr; if (job_specs->qos) { slurmdb_qos_rec_t qos_rec; char *resv_name; if (job_specs->reservation && job_specs->reservation[0] != '\0') resv_name = job_specs->reservation; else resv_name = job_ptr->resv_name; memset(&qos_rec, 0, sizeof(qos_rec)); /* If the qos is blank that means we want the default */ if (job_specs->qos[0]) qos_rec.name = job_specs->qos; new_qos_ptr = _determine_and_validate_qos( resv_name, use_assoc_ptr, operator, &qos_rec, &error_code, false, LOG_LEVEL_ERROR); if ((error_code == SLURM_SUCCESS) && new_qos_ptr) { if (job_ptr->qos_ptr == new_qos_ptr) { sched_debug("%s: new QOS identical to old QOS %pJ", __func__, job_ptr); new_qos_ptr = NULL; } else if (!IS_JOB_PENDING(job_ptr)) { error_code = ESLURM_JOB_NOT_PENDING; new_qos_ptr = NULL; } } if (error_code != SLURM_SUCCESS) goto fini; } use_qos_ptr = new_qos_ptr ? new_qos_ptr : job_ptr->qos_ptr; if (job_specs->bitflags & RESET_ACCRUE_TIME) { if (!IS_JOB_PENDING(job_ptr) || !detail_ptr) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } else acct_policy_remove_accrue_time(job_ptr, false); } /* * Must check req_nodes to set the job_ptr->details->req_node_bitmap * before we validate it later. */ if (job_specs->req_nodes && (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { /* * Use req_nodes to change the nodes associated with a running * for lack of other field in the job request to use */ if (!permit_job_shrink()) { error("%s: request to shrink %pJ denied by configuration", __func__, job_ptr); error_code = ESLURM_NOT_SUPPORTED; goto fini; } else if ((job_specs->req_nodes[0] == '\0') || node_name2bitmap(job_specs->req_nodes, false, &new_req_bitmap) || !bit_super_set(new_req_bitmap, job_ptr->node_bitmap) || (job_ptr->details && job_ptr->details->expanding_jobid)) { sched_info("%s: Invalid node list (%s) for %pJ update", __func__, job_specs->req_nodes, job_ptr); error_code = ESLURM_INVALID_NODE_NAME; goto fini; } else if (new_req_bitmap) { int i, i_first, i_last; node_record_t *node_ptr; bitstr_t *rem_nodes; /* * They requested a new list of nodes for the job. If * the batch host isn't in this list, then deny this * request. */ if (job_ptr->batch_flag) { bitstr_t *batch_host_bitmap; if (node_name2bitmap(job_ptr->batch_host, false, &batch_host_bitmap)) error("%s: Invalid batch host %s for %pJ; this should never happen", __func__, job_ptr->batch_host, job_ptr); else if (!bit_overlap_any(batch_host_bitmap, new_req_bitmap)) { error("%s: Batch host %s for %pJ is not in the requested node list %s. You cannot remove the batch host from a job when resizing.", __func__, job_ptr->batch_host, job_ptr, job_specs->req_nodes); error_code = ESLURM_INVALID_NODE_NAME; bit_free(batch_host_bitmap); goto fini; } else bit_free(batch_host_bitmap); } sched_info("%s: setting nodes to %s for %pJ", __func__, job_specs->req_nodes, job_ptr); job_pre_resize_acctg(job_ptr); i_first = bit_ffs(job_ptr->node_bitmap); if (i_first >= 0) i_last = bit_fls(job_ptr->node_bitmap); else i_last = -2; rem_nodes = bit_alloc(bit_size(job_ptr->node_bitmap)); for (i = i_first; i <= i_last; i++) { if (bit_test(new_req_bitmap, i) || !bit_test(job_ptr->node_bitmap, i)) continue; bit_set(rem_nodes, i); } #ifndef HAVE_FRONT_END abort_job_on_nodes(job_ptr, rem_nodes); #endif for (i = i_first; i <= i_last; i++) { if (!bit_test(rem_nodes, i)) continue; node_ptr = node_record_table_ptr + i; kill_step_on_node(job_ptr, node_ptr, false); excise_node_from_job(job_ptr, node_ptr); } bit_free(rem_nodes); (void) gs_job_start(job_ptr); gres_build_job_details(job_ptr->gres_list, &job_ptr->gres_detail_cnt, &job_ptr->gres_detail_str, &job_ptr->gres_used); job_post_resize_acctg(job_ptr); /* * Since job_post_resize_acctg will restart * things, don't do it again. */ update_accounting = false; } else { update_accounting = true; } FREE_NULL_BITMAP(new_req_bitmap); } else if (job_specs->req_nodes) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (job_specs->req_nodes[0] == '\0') new_req_bitmap_given = true; else { if (node_name2bitmap(job_specs->req_nodes, false, &new_req_bitmap)) { sched_info("%s: Invalid node list for job_update: %s", __func__, job_specs->req_nodes); FREE_NULL_BITMAP(new_req_bitmap); error_code = ESLURM_INVALID_NODE_NAME; } else new_req_bitmap_given = true; } } if (error_code != SLURM_SUCCESS) goto fini; /* this needs to be after partition and QOS checks */ if (job_specs->reservation && (!xstrcmp(job_specs->reservation, job_ptr->resv_name) || (!job_ptr->resv_name && job_specs->reservation[0] == '\0'))) { sched_debug("%s: new reservation identical to old reservation %pJ", __func__, job_ptr); } else if (job_specs->reservation) { if (!IS_JOB_PENDING(job_ptr) && !IS_JOB_RUNNING(job_ptr)) { error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING; } else { job_record_t tmp_job_rec; memcpy(&tmp_job_rec, job_ptr, sizeof(job_record_t)); tmp_job_rec.resv_name = xstrdup(job_specs->reservation); tmp_job_rec.resv_ptr = NULL; tmp_job_rec.part_ptr = use_part_ptr; tmp_job_rec.qos_ptr = use_qos_ptr; tmp_job_rec.assoc_ptr = use_assoc_ptr; error_code = validate_job_resv(&tmp_job_rec); /* * It doesn't matter what this is, just set it as * failure will be NULL. */ new_resv_ptr = tmp_job_rec.resv_ptr; /* * Make sure this job isn't using a partition or QOS * that requires it to be in a reservation. */ if ((error_code == SLURM_SUCCESS) && !new_resv_ptr) { if (use_part_ptr && use_part_ptr->flags & PART_FLAG_REQ_RESV) error_code = ESLURM_ACCESS_DENIED; if (use_qos_ptr && use_qos_ptr->flags & QOS_FLAG_REQ_RESV) error_code = ESLURM_INVALID_QOS; } xfree(tmp_job_rec.resv_name); } if (error_code != SLURM_SUCCESS) goto fini; } if (job_specs->cpus_per_tres || job_specs->tres_per_job || job_specs->tres_per_node || job_specs->tres_per_socket || job_specs->tres_per_task || job_specs->mem_per_tres) gres_update = true; if (gres_update) { uint16_t orig_ntasks_per_socket = NO_VAL16; if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) || (detail_ptr->expanding_jobid != 0)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } if (job_specs->num_tasks == NO_VAL) job_specs->num_tasks = detail_ptr->num_tasks; if (job_specs->min_nodes == NO_VAL) job_specs->min_nodes = detail_ptr->min_nodes; if (job_specs->max_nodes == NO_VAL) job_specs->max_nodes = detail_ptr->max_nodes; if (job_specs->ntasks_per_node == NO_VAL16) job_specs->ntasks_per_node = detail_ptr->ntasks_per_node; if ((job_specs->ntasks_per_socket == NO_VAL16) && (detail_ptr->mc_ptr) && (detail_ptr->mc_ptr->ntasks_per_socket != INFINITE16)) { job_specs->ntasks_per_socket = mc_ptr->ntasks_per_socket; orig_ntasks_per_socket = job_specs->ntasks_per_socket; } if (job_specs->cpus_per_task == NO_VAL16) job_specs->cpus_per_task = detail_ptr->cpus_per_task; gres_list = gres_plugin_job_state_dup(job_ptr->gres_list); if ((error_code = gres_plugin_job_state_validate( job_specs->cpus_per_tres, job_specs->tres_freq, job_specs->tres_per_job, job_specs->tres_per_node, job_specs->tres_per_socket, job_specs->tres_per_task, job_specs->mem_per_tres, &job_specs->num_tasks, &job_specs->min_nodes, &job_specs->max_nodes, &job_specs->ntasks_per_node, &job_specs->ntasks_per_socket, &job_specs->sockets_per_node, &job_specs->cpus_per_task, &gres_list))) { sched_info("%s: invalid GRES for %pJ", __func__, job_ptr); goto fini; } if (job_specs->num_tasks == detail_ptr->num_tasks) job_specs->num_tasks = NO_VAL; /* Unchanged */ if (job_specs->min_nodes == detail_ptr->min_nodes) job_specs->min_nodes = NO_VAL; /* Unchanged */ if (job_specs->max_nodes == detail_ptr->max_nodes) job_specs->max_nodes = NO_VAL; /* Unchanged */ if (job_specs->ntasks_per_node == detail_ptr->ntasks_per_node) job_specs->ntasks_per_node = NO_VAL16; /* Unchanged */ if (job_specs->ntasks_per_socket == orig_ntasks_per_socket) job_specs->ntasks_per_socket = NO_VAL16; /* Unchanged */ if (job_specs->cpus_per_task == detail_ptr->cpus_per_task) job_specs->cpus_per_task = NO_VAL16; /* Unchanged */ } if (gres_update) { gres_set_job_tres_cnt(gres_list, detail_ptr->min_nodes, job_specs->tres_req_cnt, false); } if ((job_specs->min_nodes != NO_VAL) && (job_specs->min_nodes != INFINITE)) { uint32_t min_cpus = (job_specs->pn_min_cpus != NO_VAL16 ? job_specs->pn_min_cpus : detail_ptr->pn_min_cpus) * job_specs->min_nodes; uint32_t num_cpus = job_specs->min_cpus != NO_VAL ? job_specs->min_cpus : job_ptr->tres_req_cnt[TRES_ARRAY_CPU]; uint32_t num_tasks = job_specs->num_tasks != NO_VAL ? job_specs->num_tasks : detail_ptr->num_tasks; if (!num_tasks) { num_tasks = detail_ptr->min_nodes; } else if (num_tasks < job_specs->min_nodes) { info("%s: adjusting num_tasks (prev: %u) to be at least min_nodes: %u", __func__, num_tasks, job_specs->min_nodes); num_tasks = job_specs->min_nodes; if (IS_JOB_PENDING(job_ptr)) job_specs->num_tasks = num_tasks; } num_tasks *= job_specs->cpus_per_task != NO_VAL16 ? job_specs->cpus_per_task : detail_ptr->cpus_per_task; num_tasks = MAX(num_tasks, min_cpus); if (num_tasks > num_cpus) { info("%s: adjusting min_cpus (prev: %u) to be at least : %u", __func__, num_cpus, num_tasks); job_specs->min_cpus = num_tasks; job_specs->pn_min_memory = job_specs->pn_min_memory != NO_VAL64 ? job_specs->pn_min_memory : detail_ptr->pn_min_memory; } assoc_mgr_lock(&locks); if (!job_specs->licenses) { license_set_job_tres_cnt(job_ptr->license_list, job_specs->tres_req_cnt, true); } assoc_mgr_unlock(&locks); job_specs->tres_req_cnt[TRES_ARRAY_NODE] = job_specs->min_nodes; } if (job_specs->min_cpus != NO_VAL) job_specs->tres_req_cnt[TRES_ARRAY_CPU] = job_specs->min_cpus; else if ((job_specs->pn_min_cpus != NO_VAL16) && (job_specs->pn_min_cpus != 0)) { job_specs->tres_req_cnt[TRES_ARRAY_CPU] = job_specs->pn_min_cpus * (job_specs->min_nodes != NO_VAL ? job_specs->min_nodes : detail_ptr ? detail_ptr->min_nodes : 1); job_specs->min_cpus = job_specs->tres_req_cnt[TRES_ARRAY_CPU]; } job_specs->tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem(NULL, job_specs->pn_min_memory, job_specs->tres_req_cnt[TRES_ARRAY_CPU] ? job_specs->tres_req_cnt[TRES_ARRAY_CPU] : job_ptr->tres_req_cnt[TRES_ARRAY_CPU], job_specs->min_nodes != NO_VAL ? job_specs->min_nodes : detail_ptr ? detail_ptr->min_nodes : 1); if (job_specs->licenses && !xstrcmp(job_specs->licenses, job_ptr->licenses)) { sched_debug("%s: new licenses identical to old licenses \"%s\"", __func__, job_ptr->licenses); xfree(job_specs->licenses); } else if (job_specs->licenses) { bool valid, pending = IS_JOB_PENDING(job_ptr); license_list = license_validate(job_specs->licenses, true, true, pending ? job_specs->tres_req_cnt : NULL, &valid); if (!valid) { sched_info("%s: invalid licenses: %s", __func__, job_specs->licenses); error_code = ESLURM_INVALID_LICENSES; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->exc_nodes && detail_ptr && !xstrcmp(job_specs->exc_nodes, detail_ptr->exc_nodes)) { sched_debug("%s: new exc_nodes identical to old exc_nodes %s", __func__, job_specs->exc_nodes); } else if (job_specs->exc_nodes) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (job_specs->exc_nodes[0] == '\0') { xfree(detail_ptr->exc_nodes); FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); } else { if (node_name2bitmap(job_specs->exc_nodes, false, &exc_bitmap)) { sched_error("%s: Invalid node list for update of %pJ: %s", __func__, job_ptr, job_specs->exc_nodes); FREE_NULL_BITMAP(exc_bitmap); error_code = ESLURM_INVALID_NODE_NAME; } if (exc_bitmap) { xfree(detail_ptr->exc_nodes); detail_ptr->exc_nodes = xstrdup(job_specs->exc_nodes); FREE_NULL_BITMAP(detail_ptr->exc_node_bitmap); detail_ptr->exc_node_bitmap = exc_bitmap; sched_info("%s: setting exc_nodes to %s for %pJ", __func__, job_specs->exc_nodes, job_ptr); } } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->min_nodes == INFINITE) { /* Used by scontrol just to get current configuration info */ job_specs->min_nodes = NO_VAL; } if ((job_specs->min_nodes != NO_VAL) && (job_specs->min_nodes > job_ptr->node_cnt) && !permit_job_expansion() && (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { info("%s: Change of size for %pJ not supported", __func__, job_ptr); error_code = ESLURM_NOT_SUPPORTED; goto fini; } if (job_specs->req_switch != NO_VAL) { job_ptr->req_switch = job_specs->req_switch; info("%s: Change of switches to %u %pJ", __func__, job_specs->req_switch, job_ptr); } if (job_specs->wait4switch != NO_VAL) { job_ptr->wait4switch = _max_switch_wait(job_specs->wait4switch); info("%s: Change of switch wait to %u secs %pJ", __func__, job_ptr->wait4switch, job_ptr); } if (job_specs->admin_comment) { if (!validate_super_user(uid)) { error("%s: Attempt to change admin_comment for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } else { xfree(job_ptr->admin_comment); job_ptr->admin_comment = xstrdup(job_specs->admin_comment); info("%s: setting admin_comment to %s for %pJ", __func__, job_ptr->admin_comment, job_ptr); } } if (job_specs->comment) { xfree(job_ptr->comment); job_ptr->comment = xstrdup(job_specs->comment); info("%s: setting comment to %s for %pJ", __func__, job_ptr->comment, job_ptr); } if (error_code != SLURM_SUCCESS) goto fini; /* * Now that we know what the new part, qos, and association are going * to be lets check the limits. * If a limit was already exceeded before this update * request, let's assume it is expected and allow the change to happen. */ if (new_qos_ptr || new_assoc_ptr || new_part_ptr) { if (!operator && (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)) { uint32_t acct_reason = 0; char *resv_orig = NULL; bool resv_reset = false, min_reset = false, max_reset = false, time_min_reset = false; if (!acct_policy_validate(job_specs, use_part_ptr, use_assoc_ptr, use_qos_ptr, &acct_reason, &acct_policy_limit_set, true) && !acct_limit_already_exceeded) { info("%s: exceeded association/QOS limit for user %u: %s", __func__, job_specs->user_id, job_reason_string(acct_reason)); error_code = ESLURM_ACCOUNTING_POLICY; goto fini; } /* * We need to set the various parts of job_specs below * to something since _valid_job_part() will validate * them. Note the reservation part is validated in the * sub call to _part_access_check(). */ if (job_specs->min_nodes == NO_VAL) { job_specs->min_nodes = detail_ptr->min_nodes; min_reset = true; } if ((job_specs->max_nodes == NO_VAL) && (detail_ptr->max_nodes != 0)) { job_specs->max_nodes = detail_ptr->max_nodes; max_reset = true; } if ((job_specs->time_min == NO_VAL) && (job_ptr->time_min != 0)) { job_specs->time_min = job_ptr->time_min; time_min_reset = true; } /* * This always gets reset, so don't worry about tracking * it. */ if (job_specs->time_limit == NO_VAL) job_specs->time_limit = job_ptr->time_limit; if (!job_specs->reservation || job_specs->reservation[0] == '\0') { resv_reset = true; resv_orig = job_specs->reservation; job_specs->reservation = job_ptr->resv_name; } if ((error_code = _valid_job_part( job_specs, uid, new_req_bitmap_given ? new_req_bitmap : job_ptr->details->req_node_bitmap, use_part_ptr, job_specs->partition ? part_ptr_list : job_ptr->part_ptr_list, use_assoc_ptr, use_qos_ptr))) goto fini; if (min_reset) job_specs->min_nodes = NO_VAL; if (max_reset) job_specs->max_nodes = NO_VAL; if (time_min_reset) job_specs->time_min = NO_VAL; if (resv_reset) job_specs->reservation = resv_orig; job_specs->time_limit = orig_time_limit; } /* * Since we are successful to this point remove the job from the * old qos/assoc's */ acct_policy_remove_job_submit(job_ptr); acct_policy_remove_accrue_time(job_ptr, false); } if (new_qos_ptr) { /* Change QOS */ job_ptr->qos_id = new_qos_ptr->id; job_ptr->qos_ptr = new_qos_ptr; job_ptr->limit_set.qos = acct_policy_limit_set.qos; if (job_ptr->state_reason == FAIL_QOS) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); } info("%s: setting QOS to %s for %pJ", __func__, new_qos_ptr->name, job_ptr); } if (new_assoc_ptr) { /* Change account/association */ xfree(job_ptr->account); job_ptr->account = xstrdup(new_assoc_ptr->acct); job_ptr->assoc_id = new_assoc_ptr->id; job_ptr->assoc_ptr = new_assoc_ptr; if (job_ptr->state_reason == FAIL_ACCOUNT) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); } info("%s: setting account to %s for %pJ", __func__, job_ptr->account, job_ptr); } if (new_part_ptr) { /* Change partition */ job_ptr->part_ptr = new_part_ptr; FREE_NULL_LIST(job_ptr->part_ptr_list); job_ptr->part_ptr_list = part_ptr_list; part_ptr_list = NULL; /* nothing to free */ _rebuild_part_name_list(job_ptr); /* Rebuilt in priority/multifactor plugin */ xfree(job_ptr->priority_array); info("%s: setting partition to %s for %pJ", __func__, job_specs->partition, job_ptr); } /* Now add the job to the new qos/assoc's */ if (new_qos_ptr || new_assoc_ptr || new_part_ptr) { update_accounting = true; acct_policy_add_job_submit(job_ptr); } if (new_req_bitmap_given) { xfree(detail_ptr->req_nodes); if (job_specs->req_nodes[0] != '\0') detail_ptr->req_nodes = xstrdup(job_specs->req_nodes); FREE_NULL_BITMAP(detail_ptr->req_node_bitmap); detail_ptr->req_node_bitmap = new_req_bitmap; new_req_bitmap = NULL; sched_info("%s: setting req_nodes to %s for %pJ", __func__, job_specs->req_nodes, job_ptr); } if (new_resv_ptr) { job_ptr->resv_name = xstrdup(new_resv_ptr->name); job_ptr->resv_ptr = new_resv_ptr; sched_info("%s: setting reservation to %s for %pJ", __func__, job_ptr->resv_name, job_ptr); update_accounting = true; } else if (job_specs->reservation && job_specs->reservation[0] == '\0' && job_ptr->resv_name) { xfree(job_ptr->resv_name); job_ptr->resv_id = 0; job_ptr->resv_ptr = NULL; sched_info("%s: setting reservation to '' for %pJ", __func__, job_ptr); update_accounting = true; } /* Reset min and max cpu counts as needed, ensure consistency */ if (job_specs->min_cpus != NO_VAL) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (job_specs->min_cpus < 1) error_code = ESLURM_INVALID_CPU_COUNT; else { save_min_cpus = detail_ptr->min_cpus; detail_ptr->min_cpus = job_specs->min_cpus; } } if (job_specs->max_cpus != NO_VAL) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else { save_max_cpus = detail_ptr->max_cpus; detail_ptr->max_cpus = job_specs->max_cpus; } } if ((save_min_cpus || save_max_cpus) && detail_ptr->max_cpus && (detail_ptr->max_cpus < detail_ptr->min_cpus)) { error_code = ESLURM_INVALID_CPU_COUNT; if (save_min_cpus) { detail_ptr->min_cpus = save_min_cpus; save_min_cpus = 0; } if (save_max_cpus) { detail_ptr->max_cpus = save_max_cpus; save_max_cpus = 0; } } if (error_code != SLURM_SUCCESS) goto fini; if (save_min_cpus && (detail_ptr->min_cpus != save_min_cpus)) { info("%s: setting min_cpus from %u to %u for %pJ", __func__, save_min_cpus, detail_ptr->min_cpus, job_ptr); job_ptr->limit_set.tres[TRES_ARRAY_CPU] = acct_policy_limit_set.tres[TRES_ARRAY_CPU]; detail_ptr->orig_min_cpus = job_specs->min_cpus; update_accounting = true; } if (save_max_cpus && (detail_ptr->max_cpus != save_max_cpus)) { info("%s: setting max_cpus from %u to %u for %pJ", __func__, save_max_cpus, detail_ptr->max_cpus, job_ptr); /* * Always use the acct_policy_limit_set.* since if set by a * super user it be set correctly */ job_ptr->limit_set.tres[TRES_ARRAY_CPU] = acct_policy_limit_set.tres[TRES_ARRAY_CPU]; detail_ptr->orig_max_cpus = job_specs->max_cpus; update_accounting = true; } if ((job_specs->pn_min_cpus != NO_VAL16) && (job_specs->pn_min_cpus != 0)) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; } else { detail_ptr->pn_min_cpus = job_specs->pn_min_cpus; detail_ptr->orig_pn_min_cpus = job_specs->pn_min_cpus; info("%s: setting pn_min_cpus to %u for %pJ", __func__, job_specs->pn_min_cpus, job_ptr); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->cpus_per_task != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; } else if (detail_ptr->cpus_per_task != job_specs->cpus_per_task) { info("%s: setting cpus_per_task from %u to %u for %pJ", __func__, detail_ptr->cpus_per_task, job_specs->cpus_per_task, job_ptr); detail_ptr->cpus_per_task = job_specs->cpus_per_task; detail_ptr->orig_cpus_per_task = job_specs->cpus_per_task; } } if (error_code != SLURM_SUCCESS) goto fini; /* Reset min and max node counts as needed, ensure consistency */ if (job_specs->min_nodes != NO_VAL) { if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) ; /* shrink running job, processed later */ else if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (job_specs->min_nodes < 1) { info("%s: min_nodes < 1 for %pJ", __func__, job_ptr); error_code = ESLURM_INVALID_NODE_COUNT; } else { /* Resize of pending job */ save_min_nodes = detail_ptr->min_nodes; detail_ptr->min_nodes = job_specs->min_nodes; } } if (job_specs->max_nodes != NO_VAL) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else { save_max_nodes = detail_ptr->max_nodes; detail_ptr->max_nodes = job_specs->max_nodes; } } if ((save_min_nodes || save_max_nodes) && detail_ptr->max_nodes && (detail_ptr->max_nodes < detail_ptr->min_nodes)) { info("%s: max_nodes < min_nodes (%u < %u) for %pJ", __func__, detail_ptr->max_nodes, detail_ptr->min_nodes, job_ptr); error_code = ESLURM_INVALID_NODE_COUNT; if (save_min_nodes) { detail_ptr->min_nodes = save_min_nodes; save_min_nodes = 0; } if (save_max_nodes) { detail_ptr->max_nodes = save_max_nodes; save_max_nodes = 0; } } if (error_code != SLURM_SUCCESS) goto fini; if (save_min_nodes && (save_min_nodes!= detail_ptr->min_nodes)) { info("%s: setting min_nodes from %u to %u for %pJ", __func__, save_min_nodes, detail_ptr->min_nodes, job_ptr); job_ptr->limit_set.tres[TRES_ARRAY_NODE] = acct_policy_limit_set.tres[TRES_ARRAY_NODE]; update_accounting = true; } if (save_max_nodes && (save_max_nodes != detail_ptr->max_nodes)) { info("%s: setting max_nodes from %u to %u for %pJ", __func__, save_max_nodes, detail_ptr->max_nodes, job_ptr); /* * Always use the acct_policy_limit_set.* since if set by a * super user it be set correctly */ job_ptr->limit_set.tres[TRES_ARRAY_NODE] = acct_policy_limit_set.tres[TRES_ARRAY_NODE]; update_accounting = true; } if (job_specs->num_tasks != NO_VAL) { if (!IS_JOB_PENDING(job_ptr)) error_code = ESLURM_JOB_NOT_PENDING; else if (job_specs->num_tasks < 1) error_code = ESLURM_BAD_TASK_COUNT; else { detail_ptr->num_tasks = job_specs->num_tasks; info("%s: setting num_tasks to %u for %pJ", __func__, job_specs->num_tasks, job_ptr); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->time_limit != NO_VAL) { if (IS_JOB_FINISHED(job_ptr) || job_ptr->preempt_time) error_code = ESLURM_JOB_FINISHED; else if (job_ptr->time_limit == job_specs->time_limit) { sched_debug("%s: new time limit identical to old time limit %pJ", __func__, job_ptr); } else if (operator || (job_ptr->time_limit > job_specs->time_limit)) { time_t old_time = job_ptr->time_limit; uint32_t use_time_min = job_specs->time_min != NO_VAL ? job_specs->time_min : job_ptr->time_min; if (old_time == INFINITE) /* one year in mins */ old_time = (365 * 24 * 60); if (job_specs->time_limit < use_time_min) { sched_info("%s: attempt to set time_limit < time_min (%u < %u)", __func__, job_specs->time_limit, use_time_min); error_code = ESLURM_INVALID_TIME_MIN_LIMIT; goto fini; } acct_policy_alter_job(job_ptr, job_specs->time_limit); job_ptr->time_limit = job_specs->time_limit; if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { if (job_ptr->preempt_time) { ; /* Preemption in progress */ } else if (job_ptr->time_limit == INFINITE) { /* Set end time in one year */ job_ptr->end_time = now + (365 * 24 * 60 * 60); } else { /* * Update end_time based upon change * to preserve suspend time info */ job_ptr->end_time = job_ptr->end_time + ((job_ptr->time_limit - old_time) * 60); } if (job_ptr->end_time < now) job_ptr->end_time = now; if (IS_JOB_RUNNING(job_ptr) && (list_is_empty(job_ptr->step_list) == 0)) { _xmit_new_end_time(job_ptr); } job_ptr->end_time_exp = job_ptr->end_time; } sched_info("%s: setting time_limit to %u for %pJ", __func__, job_specs->time_limit, job_ptr); /* * Always use the acct_policy_limit_set.* * since if set by a super user it be set correctly */ job_ptr->limit_set.time = acct_policy_limit_set.time; update_accounting = true; } else if (IS_JOB_PENDING(job_ptr) && job_ptr->part_ptr && (job_ptr->part_ptr->max_time >= job_specs->time_limit)) { job_ptr->time_limit = job_specs->time_limit; sched_info("%s: setting time_limit to %u for %pJ", __func__, job_specs->time_limit, job_ptr); /* * Always use the acct_policy_limit_set.* * since if set by a super user it be set correctly */ job_ptr->limit_set.time = acct_policy_limit_set.time; update_accounting = true; } else { sched_info("%s: Attempt to increase time limit for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if ((job_specs->time_min != NO_VAL) && IS_JOB_PENDING(job_ptr)) { if (job_specs->time_min > job_ptr->time_limit) { info("%s: attempt to set TimeMin > TimeLimit (%u > %u)", __func__, job_specs->time_min, job_ptr->time_limit); error_code = ESLURM_INVALID_TIME_MIN_LIMIT; } else if (job_ptr->time_min != job_specs->time_min) { job_ptr->time_min = job_specs->time_min; info("%s: setting TimeMin to %u for %pJ", __func__, job_specs->time_min, job_ptr); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->end_time) { if (!IS_JOB_RUNNING(job_ptr) || job_ptr->preempt_time) { /* * We may want to use this for deadline scheduling * at some point in the future. For now only reset * the time limit of running jobs. */ error_code = ESLURM_JOB_NOT_RUNNING; } else if (job_specs->end_time < now) { error_code = ESLURM_INVALID_TIME_VALUE; } else if (operator || (job_ptr->end_time > job_specs->end_time)) { int delta_t = job_specs->end_time - job_ptr->end_time; job_ptr->end_time = job_specs->end_time; job_ptr->time_limit += (delta_t+30)/60; /* Sec->min */ sched_info("%s: setting time_limit to %u for %pJ", __func__, job_ptr->time_limit, job_ptr); /* Always use the acct_policy_limit_set.* * since if set by a super user it be set correctly */ job_ptr->limit_set.time = acct_policy_limit_set.time; update_accounting = true; } else { sched_info("%s: Attempt to extend end time for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if ((job_specs->deadline) && (!IS_JOB_RUNNING(job_ptr))) { char time_str[32]; slurm_make_time_str(&job_ptr->deadline, time_str, sizeof(time_str)); if (job_specs->deadline < now) { error_code = ESLURM_INVALID_TIME_VALUE; } else if (operator) { /* update deadline */ job_ptr->deadline = job_specs->deadline; sched_info("%s: setting deadline to %s for %pJ", __func__, time_str, job_ptr); /* * Always use the acct_policy_limit_set.* * since if set by a super user it be set correctly */ job_ptr->limit_set.time = acct_policy_limit_set.time; update_accounting = true; } else { sched_info("%s: Attempt to extend end time for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->delay_boot != NO_VAL) { job_ptr->delay_boot = job_specs->delay_boot; sched_info("%s: setting delay_boot to %u for %pJ", __func__, job_specs->delay_boot, job_ptr); } if ((job_specs->requeue != NO_VAL16) && detail_ptr) { detail_ptr->requeue = MIN(job_specs->requeue, 1); sched_info("%s: setting requeue to %u for %pJ", __func__, job_specs->requeue, job_ptr); } if (job_specs->priority != NO_VAL) { /* * If we are doing time slicing we could update the * priority of the job while running to give better * position (larger time slices) than competing jobs */ if (IS_JOB_FINISHED(job_ptr) || (detail_ptr == NULL)) error_code = ESLURM_JOB_FINISHED; else if (job_ptr->priority == job_specs->priority) { debug("%s: setting priority to current value",__func__); if ((job_ptr->priority == 0) && operator) { /* * Authorized user can change from user hold * to admin hold or admin hold to user hold */ if (job_specs->alloc_sid == ALLOC_SID_USER_HOLD) job_ptr->state_reason = WAIT_HELD_USER; else job_ptr->state_reason = WAIT_HELD; } } else if ((job_ptr->priority == 0) && (job_specs->priority == INFINITE) && (operator || (job_ptr->state_reason == WAIT_RESV_DELETED) || (job_ptr->state_reason == WAIT_HELD_USER))) { _release_job(job_ptr, uid); } else if ((job_ptr->priority == 0) && (job_specs->priority != INFINITE)) { info("%s: ignore priority reset request on held %pJ", __func__, job_ptr); error_code = ESLURM_JOB_HELD; } else if (operator || (job_ptr->priority > job_specs->priority)) { if (job_specs->priority != 0) job_ptr->details->nice = NICE_OFFSET; if (job_specs->priority == INFINITE) { job_ptr->direct_set_prio = 0; set_job_prio(job_ptr); } else if (job_specs->priority == 0) { _hold_job(job_ptr, uid); } else { if (operator) { /* * Only administrator can make * persistent change to a job's * priority, except holding a job */ job_ptr->direct_set_prio = 1; } else error_code = ESLURM_PRIO_RESET_FAIL; job_ptr->priority = job_specs->priority; if (job_ptr->part_ptr_list && job_ptr->priority_array) { int i, j = list_count( job_ptr->part_ptr_list); for (i = 0; i < j; i++) { job_ptr->priority_array[i] = job_specs->priority; } } } sched_info("%s: set priority to %u for %pJ", __func__, job_ptr->priority, job_ptr); update_accounting = true; if (job_ptr->priority == 0) { if (!operator || (job_specs->alloc_sid == ALLOC_SID_USER_HOLD)) { job_ptr->state_reason = WAIT_HELD_USER; } else job_ptr->state_reason = WAIT_HELD; xfree(job_ptr->state_desc); /* remove pending remote sibling jobs */ if (IS_JOB_PENDING(job_ptr) && !IS_JOB_REVOKED(job_ptr)) { fed_mgr_job_revoke_sibs(job_ptr); } } } else if ((job_ptr->priority != 0) && (job_specs->priority == INFINITE)) { /* * If the job was already released, ignore another * release request. */ debug("%s: %pJ already released, ignoring request", __func__, job_ptr); } else { sched_error("Attempt to modify priority for %pJ", job_ptr); error_code = ESLURM_ACCESS_DENIED; } } else if (job_ptr->state_reason == FAIL_BAD_CONSTRAINTS) { /* * We need to check if the state is BadConstraints here since we * are altering the job the bad constraint might have gone * away. If it did the priority (0) wouldn't get reset so the * job would just go into JobAdminHeld otherwise. */ job_ptr->direct_set_prio = 0; set_job_prio(job_ptr); sched_debug("%s: job request changed somehow, removing the bad constraints to reevaluate %pJ uid %u", __func__, job_ptr, uid); job_ptr->state_reason = WAIT_NO_REASON; } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->nice != NO_VAL) { if (IS_JOB_FINISHED(job_ptr) || (job_ptr->details == NULL)) error_code = ESLURM_JOB_FINISHED; else if (job_ptr->details && (job_ptr->details->nice == job_specs->nice)) sched_debug("%s: new nice identical to old nice %pJ", __func__, job_ptr); else if (job_ptr->direct_set_prio && job_ptr->priority != 0) info("%s: ignore nice set request on %pJ", __func__, job_ptr); else if (operator || (job_specs->nice >= NICE_OFFSET)) { if (!xstrcmp(slurmctld_conf.priority_type, "priority/basic")) { int64_t new_prio = job_ptr->priority; new_prio += job_ptr->details->nice; new_prio -= job_specs->nice; job_ptr->priority = MAX(new_prio, 2); sched_info("%s: nice changed from %u to %u, setting priority to %u for %pJ", __func__, job_ptr->details->nice, job_specs->nice, job_ptr->priority, job_ptr); } job_ptr->details->nice = job_specs->nice; update_accounting = true; } else { sched_error("%s: Attempt to modify nice for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->pn_min_memory != NO_VAL64) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; } else if (job_specs->pn_min_memory == detail_ptr->pn_min_memory) { sched_debug("%s: new memory limit identical to old limit for %pJ", __func__, job_ptr); } else { char *entity; if (job_specs->pn_min_memory == MEM_PER_CPU) { /* Map --mem-per-cpu=0 to --mem=0 */ job_specs->pn_min_memory = 0; } if (job_specs->pn_min_memory & MEM_PER_CPU) entity = "cpu"; else entity = "job"; detail_ptr->pn_min_memory = job_specs->pn_min_memory; detail_ptr->orig_pn_min_memory = job_specs->pn_min_memory; job_ptr->bit_flags |= JOB_MEM_SET; sched_info("%s: setting min_memory_%s to %"PRIu64" for %pJ", __func__, entity, (job_specs->pn_min_memory & (~MEM_PER_CPU)), job_ptr); /* * Always use the acct_policy_limit_set.* * since if set by a super user it be set correctly */ job_ptr->limit_set.tres[TRES_ARRAY_MEM] = acct_policy_limit_set.tres[TRES_ARRAY_MEM]; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->pn_min_tmp_disk != NO_VAL) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; } else { detail_ptr->pn_min_tmp_disk = job_specs->pn_min_tmp_disk; sched_info("%s: setting job_min_tmp_disk to %u for %pJ", __func__, job_specs->pn_min_tmp_disk, job_ptr); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->sockets_per_node != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } else { mc_ptr->sockets_per_node = job_specs->sockets_per_node; sched_info("%s: setting sockets_per_node to %u for %pJ", __func__, job_specs->sockets_per_node, job_ptr); } } if (job_specs->cores_per_socket != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } else { mc_ptr->cores_per_socket = job_specs->cores_per_socket; sched_info("%s: setting cores_per_socket to %u for %pJ", __func__, job_specs->cores_per_socket, job_ptr); } } if ((job_specs->threads_per_core != NO_VAL16)) { if ((!IS_JOB_PENDING(job_ptr)) || (mc_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } else { mc_ptr->threads_per_core = job_specs->threads_per_core; sched_info("%s: setting threads_per_core to %u for %pJ", __func__, job_specs->threads_per_core, job_ptr); } } if (job_specs->shared != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; } else if (!operator) { sched_error("%s: Attempt to change sharing for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } else { if (job_specs->shared) { detail_ptr->share_res = 1; detail_ptr->whole_node = 0; } else { detail_ptr->share_res = 0; } sched_info("%s: setting shared to %u for %pJ", __func__, job_specs->shared, job_ptr); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->contiguous != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (operator || (detail_ptr->contiguous > job_specs->contiguous)) { detail_ptr->contiguous = job_specs->contiguous; sched_info("%s: setting contiguous to %u for %pJ", __func__, job_specs->contiguous, job_ptr); } else { sched_error("%s: Attempt to add contiguous for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->core_spec != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (operator && (slurmctld_conf.conf_flags & CTL_CONF_ASRU)) { if (job_specs->core_spec == INFINITE16) detail_ptr->core_spec = NO_VAL16; else detail_ptr->core_spec = job_specs->core_spec; sched_info("%s: setting core_spec to %u for %pJ", __func__, detail_ptr->core_spec, job_ptr); if (detail_ptr->core_spec != NO_VAL16) detail_ptr->whole_node = 1; } else { sched_error("%s Attempt to modify core_spec for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->features && detail_ptr && !xstrcmp(job_specs->features, detail_ptr->features)) { sched_debug("%s: new features identical to old features %s", __func__, job_specs->features); } else if (job_specs->features) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (job_specs->features[0] != '\0') { char *old_features = detail_ptr->features; List old_list = detail_ptr->feature_list; detail_ptr->features = xstrdup(job_specs->features); detail_ptr->feature_list = NULL; if (build_feature_list(job_ptr)) { sched_info("%s: invalid features(%s) for %pJ", __func__, job_specs->features, job_ptr); FREE_NULL_LIST(detail_ptr->feature_list); detail_ptr->features = old_features; detail_ptr->feature_list = old_list; error_code = ESLURM_INVALID_FEATURE; } else { sched_info("%s: setting features to %s for %pJ", __func__, job_specs->features, job_ptr); xfree(old_features); FREE_NULL_LIST(old_list); } } else { sched_info("%s: cleared features for %pJ", __func__, job_ptr); xfree(detail_ptr->features); FREE_NULL_LIST(detail_ptr->feature_list); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->cluster_features && (error_code = fed_mgr_update_job_cluster_features( job_ptr, job_specs->cluster_features))) goto fini; if (job_specs->clusters && (error_code = fed_mgr_update_job_clusters(job_ptr, job_specs->clusters))) goto fini; if (gres_list) { char *tmp = NULL; if (job_specs->cpus_per_tres) { xstrfmtcat(tmp, "cpus_per_tres:%s ", job_specs->cpus_per_tres); xfree(job_ptr->cpus_per_tres); job_ptr->cpus_per_tres = job_specs->cpus_per_tres; job_specs->cpus_per_tres = NULL; } if (job_specs->tres_per_job) { xstrfmtcat(tmp, "tres_per_job:%s ", job_specs->tres_per_job); xfree(job_ptr->tres_per_job); job_ptr->tres_per_job = job_specs->tres_per_job; job_specs->tres_per_job = NULL; } if (job_specs->tres_per_node) { xstrfmtcat(tmp, "tres_per_node:%s ", job_specs->tres_per_node); xfree(job_ptr->tres_per_node); job_ptr->tres_per_node = job_specs->tres_per_node; job_specs->tres_per_node = NULL; } if (job_specs->tres_per_socket) { xstrfmtcat(tmp, "tres_per_socket:%s ", job_specs->tres_per_socket); xfree(job_ptr->tres_per_socket); job_ptr->tres_per_socket = job_specs->tres_per_socket; job_specs->tres_per_socket = NULL; } if (job_specs->tres_per_task) { xstrfmtcat(tmp, "tres_per_task:%s ", job_specs->tres_per_task); xfree(job_ptr->tres_per_task); job_ptr->tres_per_task = job_specs->tres_per_task; job_specs->tres_per_task = NULL; } if (job_specs->mem_per_tres) { xstrfmtcat(tmp, "mem_per_tres:%s ", job_specs->mem_per_tres); xfree(job_ptr->mem_per_tres); job_ptr->mem_per_tres = job_specs->mem_per_tres; job_specs->mem_per_tres = NULL; } sched_info("%s: setting %sfor %pJ", __func__, tmp, job_ptr); xfree(tmp); FREE_NULL_LIST(job_ptr->gres_list); job_ptr->gres_list = gres_list; gres_build_job_details(job_ptr->gres_list, &job_ptr->gres_detail_cnt, &job_ptr->gres_detail_str, &job_ptr->gres_used); gres_list = NULL; } if (job_specs->name) { if (IS_JOB_FINISHED(job_ptr)) { error_code = ESLURM_JOB_FINISHED; goto fini; } else if (!xstrcmp(job_specs->name, job_ptr->name)) { sched_debug("%s: new name identical to old name %pJ", __func__, job_ptr); } else { xfree(job_ptr->name); job_ptr->name = xstrdup(job_specs->name); sched_info("%s: setting name to %s for %pJ", __func__, job_ptr->name, job_ptr); update_accounting = true; } } if (job_specs->work_dir && detail_ptr && !xstrcmp(job_specs->work_dir, detail_ptr->work_dir)) { sched_debug("%s: new work_dir identical to old work_dir %s", __func__, job_specs->work_dir); } else if (job_specs->work_dir) { if (!IS_JOB_PENDING(job_ptr)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } else if (detail_ptr) { xfree(detail_ptr->work_dir); detail_ptr->work_dir = xstrdup(job_specs->work_dir); sched_info("%s: setting work_dir to %s for %pJ", __func__, detail_ptr->work_dir, job_ptr); update_accounting = true; } } if (job_specs->std_out && detail_ptr && !xstrcmp(job_specs->std_out, detail_ptr->std_out)) { sched_debug("%s: new std_out identical to old std_out %s", __func__, job_specs->std_out); } else if (job_specs->std_out) { if (!IS_JOB_PENDING(job_ptr)) error_code = ESLURM_JOB_NOT_PENDING; else if (detail_ptr) { xfree(detail_ptr->std_out); detail_ptr->std_out = xstrdup(job_specs->std_out); } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->wckey && !xstrcmp(job_specs->wckey, job_ptr->wckey)) { sched_debug("%s: new wckey identical to old wckey %pJ", __func__, job_ptr); } else if (job_specs->wckey) { if (!IS_JOB_PENDING(job_ptr)) error_code = ESLURM_JOB_NOT_PENDING; else { int rc = update_job_wckey((char *) __func__, job_ptr, job_specs->wckey); if (rc != SLURM_SUCCESS) error_code = rc; else update_accounting = true; } } if (error_code != SLURM_SUCCESS) goto fini; if ((job_specs->min_nodes != NO_VAL) && (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))) { /* * Use req_nodes to change the nodes associated with a running * for lack of other field in the job request to use */ if ((job_specs->min_nodes == 0) && (job_ptr->node_cnt > 0) && job_ptr->details && job_ptr->details->expanding_jobid) { job_record_t *expand_job_ptr; bitstr_t *orig_job_node_bitmap, *orig_jobx_node_bitmap; expand_job_ptr = find_job_record(job_ptr->details-> expanding_jobid); if (expand_job_ptr == NULL) { info("%s: Invalid node count (%u) for %pJ update, JobId=%u to expand not found", __func__, job_specs->min_nodes, job_ptr, job_ptr->details->expanding_jobid); error_code = ESLURM_INVALID_JOB_ID; goto fini; } if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_SUSPENDED(expand_job_ptr)) { info("%s: Can not expand %pJ from %pJ, job is suspended", __func__, expand_job_ptr, job_ptr); error_code = ESLURM_JOB_SUSPENDED; goto fini; } if ((job_ptr->step_list != NULL) && (list_count(job_ptr->step_list) != 0)) { info("%s: Attempt to merge %pJ with active steps into %pJ", __func__, job_ptr, expand_job_ptr); error_code = ESLURMD_STEP_EXISTS; goto fini; } sched_info("%s: killing %pJ and moving all resources to %pJ", __func__, job_ptr, expand_job_ptr); job_pre_resize_acctg(job_ptr); job_pre_resize_acctg(expand_job_ptr); _send_job_kill(job_ptr); xassert(job_ptr->job_resrcs); xassert(job_ptr->job_resrcs->node_bitmap); xassert(expand_job_ptr->job_resrcs->node_bitmap); orig_job_node_bitmap = bit_copy(job_ptr->node_bitmap); orig_jobx_node_bitmap = bit_copy(expand_job_ptr-> job_resrcs-> node_bitmap); error_code = select_g_job_expand(job_ptr, expand_job_ptr); if (error_code == SLURM_SUCCESS) { _merge_job_licenses(job_ptr, expand_job_ptr); FREE_NULL_BITMAP(job_ptr->node_bitmap); job_ptr->node_bitmap = orig_job_node_bitmap; orig_job_node_bitmap = NULL; deallocate_nodes(job_ptr, false, false, false); bit_clear_all(job_ptr->node_bitmap); job_ptr->job_state &= JOB_STATE_FLAGS; job_ptr->job_state |= JOB_COMPLETE; _realloc_nodes(expand_job_ptr, orig_jobx_node_bitmap); rebuild_step_bitmaps(expand_job_ptr, orig_jobx_node_bitmap); (void) gs_job_fini(job_ptr); (void) gs_job_start(expand_job_ptr); } FREE_NULL_BITMAP(orig_job_node_bitmap); FREE_NULL_BITMAP(orig_jobx_node_bitmap); job_post_resize_acctg(job_ptr); job_post_resize_acctg(expand_job_ptr); /* * Since job_post_resize_acctg will restart things, * don't do it again. */ update_accounting = false; if (error_code) goto fini; } else if ((job_specs->min_nodes == 0) || (job_specs->min_nodes > job_ptr->node_cnt) || job_ptr->details->expanding_jobid) { sched_info("%s: Invalid node count (%u) for %pJ update", __func__, job_specs->min_nodes, job_ptr); error_code = ESLURM_INVALID_NODE_COUNT; goto fini; } else if (job_specs->min_nodes == job_ptr->node_cnt) { debug2("%s: No change in node count update for %pJ", __func__, job_ptr); } else if (!permit_job_shrink()) { error("%s: request to shrink %pJ denied by configuration", __func__, job_ptr); error_code = ESLURM_NOT_SUPPORTED; goto fini; } else { int i, i_first, i_last, total = 0; node_record_t *node_ptr; bitstr_t *rem_nodes, *tmp_nodes; sched_info("%s: set node count to %u for %pJ", __func__, job_specs->min_nodes, job_ptr); job_pre_resize_acctg(job_ptr); /* * Don't remove the batch host from the job. The batch * host isn't guaranteed to be the first bit set in * job_ptr->node_bitmap because the batch host can be * selected with the --batch and --constraint sbatch * flags. */ tmp_nodes = bit_copy(job_ptr->node_bitmap); if (job_ptr->batch_host) { bitstr_t *batch_host_bitmap; if (node_name2bitmap(job_ptr->batch_host, false, &batch_host_bitmap)) error("%s: Invalid batch host %s for %pJ; this should never happen", __func__, job_ptr->batch_host, job_ptr); else { bit_and_not(tmp_nodes, batch_host_bitmap); bit_free(batch_host_bitmap); /* * Set total to 1 since we're * guaranteeing that we won't remove the * batch host. */ total = 1; } } i_first = bit_ffs(tmp_nodes); if (i_first >= 0) i_last = bit_fls(tmp_nodes); else i_last = -2; rem_nodes = bit_alloc(bit_size(tmp_nodes)); for (i = i_first; i <= i_last; i++) { if (!bit_test(tmp_nodes, i)) continue; if (++total <= job_specs->min_nodes) continue; bit_set(rem_nodes, i); } #ifndef HAVE_FRONT_END abort_job_on_nodes(job_ptr, rem_nodes); #endif for (i = i_first, total = 0; i <= i_last; i++) { if (!bit_test(rem_nodes, i)) continue; node_ptr = node_record_table_ptr + i; kill_step_on_node(job_ptr, node_ptr, false); excise_node_from_job(job_ptr, node_ptr); } bit_free(rem_nodes); bit_free(tmp_nodes); (void) gs_job_start(job_ptr); job_post_resize_acctg(job_ptr); sched_info("%s: set nodes to %s for %pJ", __func__, job_ptr->nodes, job_ptr); /* * Since job_post_resize_acctg() will restart * things don't do it again. */ update_accounting = false; } gres_build_job_details(job_ptr->gres_list, &job_ptr->gres_detail_cnt, &job_ptr->gres_detail_str, &job_ptr->gres_used); } if (job_specs->ntasks_per_node != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL)) error_code = ESLURM_JOB_NOT_PENDING; else if (operator) { detail_ptr->ntasks_per_node = job_specs->ntasks_per_node; sched_info("%s: setting ntasks_per_node to %u for %pJ", __func__, job_specs->ntasks_per_node, job_ptr); } else { sched_error("%s: Not super user: ignore ntasks_per_node change for job %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->ntasks_per_socket != NO_VAL16) { if ((!IS_JOB_PENDING(job_ptr)) || (detail_ptr == NULL) || (detail_ptr->mc_ptr == NULL)) { error_code = ESLURM_JOB_NOT_PENDING; } else if (operator) { detail_ptr->mc_ptr->ntasks_per_socket = job_specs->ntasks_per_socket; sched_info("%s: setting ntasks_per_socket to %u for %pJ", __func__, job_specs->ntasks_per_socket, job_ptr); } else { sched_error("%s: Not super user: ignore ntasks_per_socket change for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->dependency) { /* Can't update dependency of revoked job */ if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL) || IS_JOB_REVOKED(job_ptr)) error_code = ESLURM_JOB_NOT_PENDING; else if (!fed_mgr_is_origin_job(job_ptr)) { /* * If the job became independent because of a dependency * update, that job gets requeued on siblings and then * the dependency update gets sent to siblings. So we * silently ignore this update on the sibling. */ } else { int rc; rc = update_job_dependency(job_ptr, job_specs->dependency); if (rc != SLURM_SUCCESS) error_code = rc; /* * Because dependencies updated and we don't know where * they used to be, send dependencies to all siblings * so the siblings can update their dependency list. */ else { rc = fed_mgr_submit_remote_dependencies(job_ptr, true, false); if (rc) { error("%s: %pJ Failed to send remote dependencies to some or all siblings.", __func__, job_ptr); error_code = rc; } /* * Even if we fail to send remote dependencies, * we already succeeded in updating the job's * dependency locally, so we still need to * do these things. */ job_ptr->details->orig_dependency = xstrdup(job_ptr->details->dependency); sched_info("%s: setting dependency to %s for %pJ", __func__, job_ptr->details->dependency, job_ptr); /* * If the job isn't independent, remove pending * remote sibling jobs */ if (!job_independent(job_ptr)) fed_mgr_job_revoke_sibs(job_ptr); } } } if (error_code != SLURM_SUCCESS) goto fini; if (job_specs->begin_time) { if (IS_JOB_PENDING(job_ptr) && detail_ptr) { char time_str[32]; /* * Make sure this time is current, it does no good for * accounting to say this job could have started before * now */ if (job_specs->begin_time < now) job_specs->begin_time = now; if (detail_ptr->begin_time != job_specs->begin_time) { detail_ptr->begin_time = job_specs->begin_time; update_accounting = true; slurm_make_time_str(&detail_ptr->begin_time, time_str, sizeof(time_str)); sched_info("%s: setting begin to %s for %pJ", __func__, time_str, job_ptr); acct_policy_remove_accrue_time(job_ptr, false); } else sched_debug("%s: new begin time identical to old begin time %pJ", __func__, job_ptr); } else { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } } if (job_specs->licenses) { if (IS_JOB_PENDING(job_ptr)) { FREE_NULL_LIST(job_ptr->license_list); job_ptr->license_list = license_list; license_list = NULL; sched_info("%s: changing licenses from '%s' to '%s' for pending %pJ", __func__, job_ptr->licenses, job_specs->licenses, job_ptr); xfree(job_ptr->licenses); job_ptr->licenses = xstrdup(job_specs->licenses); } else if (IS_JOB_RUNNING(job_ptr)) { /* * Operators can modify license counts on running jobs, * regular users can only completely remove license * counts on running jobs. */ if (!operator && license_list) { sched_error("%s: Not operator user: ignore licenses change for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; goto fini; } /* * NOTE: This can result in oversubscription of * licenses */ license_job_return(job_ptr); FREE_NULL_LIST(job_ptr->license_list); job_ptr->license_list = license_list; license_list = NULL; sched_info("%s: changing licenses from '%s' to '%s' for running %pJ", __func__, job_ptr->licenses, job_specs->licenses, job_ptr); xfree(job_ptr->licenses); job_ptr->licenses = xstrdup(job_specs->licenses); license_job_get(job_ptr); } else { /* * licenses are valid, but job state or user not * allowed to make changes */ sched_info("%s: could not change licenses for %pJ", __func__, job_ptr); error_code = ESLURM_JOB_NOT_PENDING_NOR_RUNNING; FREE_NULL_LIST(license_list); } update_accounting = true; } if (error_code != SLURM_SUCCESS) goto fini; fail_reason = job_limits_check(&job_ptr, false); if (fail_reason != WAIT_NO_REASON) { if (fail_reason == WAIT_QOS_THRES) error_code = ESLURM_QOS_THRES; else if ((fail_reason == WAIT_PART_TIME_LIMIT) || (fail_reason == WAIT_PART_NODE_LIMIT) || (fail_reason == WAIT_PART_DOWN) || (fail_reason == WAIT_HELD)) error_code = SLURM_SUCCESS; else error_code = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; if (error_code != SLURM_SUCCESS) { if ((job_ptr->state_reason != WAIT_HELD) && (job_ptr->state_reason != WAIT_HELD_USER) && (job_ptr->state_reason != WAIT_RESV_DELETED)) { job_ptr->state_reason = fail_reason; xfree(job_ptr->state_desc); } goto fini; } } else if ((job_ptr->state_reason != WAIT_HELD) && (job_ptr->state_reason != WAIT_HELD_USER) && (job_ptr->state_reason != WAIT_RESV_DELETED) /* * A job update can come while the prolog is running. * Don't change state_reason if the prolog is running. * _is_prolog_finished() relies on state_reason==WAIT_PROLOG * to know if the prolog is running. If we change it here, * then slurmctld will think that the prolog isn't running * anymore and _slurm_rpc_job_ready will tell srun that the * prolog is done even if it isn't. Then srun can launch a * job step before the prolog is done, which breaks the * behavior of PrologFlags=alloc and means that the job step * could launch before the extern step sets up x11. */ && (job_ptr->state_reason != WAIT_PROLOG) && (job_ptr->state_reason != WAIT_MAX_REQUEUE)) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); } if (job_specs->reboot != NO_VAL16) { if (!validate_super_user(uid)) { error("%s: Attempt to change reboot for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; } else if (!IS_JOB_PENDING(job_ptr)) { error_code = ESLURM_JOB_NOT_PENDING; goto fini; } else { sched_info("%s: setting reboot to %u for %pJ", __func__, job_specs->reboot, job_ptr); if (job_specs->reboot == 0) job_ptr->reboot = 0; else job_ptr->reboot = MAX(1, job_specs->reboot); } } if (job_specs->network && !xstrcmp(job_specs->network, job_ptr->network)) { sched_debug("%s: new network identical to old network %s", __func__, job_ptr->network); } else if (job_specs->network) { xfree(job_ptr->network); if (!strlen(job_specs->network) || !xstrcmp(job_specs->network, "none")) { sched_info("%s: clearing Network option for %pJ", __func__, job_ptr); } else { job_ptr->network = xstrdup(job_specs->network); sched_info("%s: setting Network to %s for %pJ", __func__, job_ptr->network, job_ptr); select_g_select_jobinfo_set( job_ptr->select_jobinfo, SELECT_JOBDATA_NETWORK, job_ptr->network); } } if (job_specs->fed_siblings_viable) { if (!job_ptr->fed_details) { error_code = ESLURM_JOB_NOT_FEDERATED; goto fini; } info("%s: setting fed_siblings from %"PRIu64" to %"PRIu64" for %pJ", __func__, job_ptr->fed_details->siblings_viable, job_specs->fed_siblings_viable, job_ptr); job_ptr->fed_details->siblings_viable = job_specs->fed_siblings_viable; update_job_fed_details(job_ptr); } if (job_specs->cpus_per_tres) { if (!valid_tres_cnt(job_specs->cpus_per_tres)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->cpus_per_tres); if (!strlen(job_specs->cpus_per_tres)) { sched_info("%s: clearing CpusPerTres option for %pJ", __func__, job_ptr); } else { job_ptr->cpus_per_tres = xstrdup(job_specs->cpus_per_tres); sched_info("%s: setting CpusPerTres to %s for %pJ", __func__, job_ptr->cpus_per_tres, job_ptr); } } if (job_specs->mem_per_tres) { if (!valid_tres_cnt(job_specs->mem_per_tres)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->mem_per_tres); if (!strlen(job_specs->mem_per_tres)) { sched_info("%s: clearing MemPerTres option for %pJ", __func__, job_ptr); } else { job_ptr->mem_per_tres = xstrdup(job_specs->mem_per_tres); sched_info("%s: setting MemPerTres to %s for %pJ", __func__, job_ptr->mem_per_tres, job_ptr); } } if (job_specs->tres_bind) { if (tres_bind_verify_cmdline(job_specs->tres_bind)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->tres_bind); if (!strlen(job_specs->tres_bind)) { sched_info("%s: clearing TresBind option for %pJ", __func__, job_ptr); } else { job_ptr->tres_bind = xstrdup(job_specs->tres_bind); sched_info("%s: setting TresBind to %s for %pJ", __func__, job_ptr->tres_bind, job_ptr); } } if (job_specs->tres_freq) { if (tres_freq_verify_cmdline(job_specs->tres_freq)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->tres_freq); if (!strlen(job_specs->tres_freq)) { sched_info("%s: clearing TresFreq option for %pJ", __func__, job_ptr); } else { job_ptr->tres_freq = xstrdup(job_specs->tres_freq); sched_info("%s: setting TresFreq to %s for %pJ", __func__, job_ptr->tres_freq, job_ptr); } } if (job_specs->tres_per_job) { if (!valid_tres_cnt(job_specs->tres_per_job)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->tres_per_job); if (!strlen(job_specs->tres_per_job)) { sched_info("%s: clearing TresPerJob option for %pJ", __func__, job_ptr); } else { job_ptr->tres_per_job = xstrdup(job_specs->tres_per_job); sched_info("%s: setting TresPerJob to %s for %pJ", __func__, job_ptr->tres_per_job, job_ptr); } } if (job_specs->tres_per_node) { if (!valid_tres_cnt(job_specs->tres_per_node)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->tres_per_node); if (!strlen(job_specs->tres_per_node)) { sched_info("%s: clearing TresPerNode option for %pJ", __func__, job_ptr); } else { job_ptr->tres_per_node = xstrdup(job_specs->tres_per_node); sched_info("%s: setting TresPerNode to %s for %pJ", __func__, job_ptr->tres_per_node, job_ptr); } } if (job_specs->tres_per_socket) { if (!valid_tres_cnt(job_specs->tres_per_socket)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->tres_per_socket); if (!strlen(job_specs->tres_per_socket)) { sched_info("%s: clearing TresPerSocket option for %pJ", __func__, job_ptr); } else { job_ptr->tres_per_socket = xstrdup(job_specs->tres_per_socket); sched_info("%s: setting TresPerSocket to %s for %pJ", __func__, job_ptr->tres_per_socket, job_ptr); } } if (job_specs->tres_per_task) { if (!valid_tres_cnt(job_specs->tres_per_task)) { error_code = ESLURM_INVALID_TRES; goto fini; } xfree(job_ptr->tres_per_task); if (!strlen(job_specs->tres_per_task)) { sched_info("%s: clearing TresPerTask option for %pJ", __func__, job_ptr); } else { job_ptr->tres_per_task = xstrdup(job_specs->tres_per_task); sched_info("%s: setting TresPerTask to %s for %pJ", __func__, job_ptr->tres_per_task, job_ptr); } } if (job_specs->mail_type != NO_VAL16) { job_ptr->mail_type = job_specs->mail_type; if (!job_specs->mail_user) { char *tmp = job_ptr->mail_user; if (job_ptr->mail_type) { job_ptr->mail_user = _get_mail_user(tmp, job_ptr->user_id); } xfree(tmp); } sched_info("%s: setting mail_type to %u for %pJ", __func__, job_ptr->mail_type, job_ptr); } if (job_specs->mail_user) { xfree(job_ptr->mail_user); if (job_ptr->mail_type) job_ptr->mail_user = _get_mail_user(job_specs->mail_user, job_ptr->user_id); sched_info("%s: setting mail_user to %s for %pJ", __func__, job_ptr->mail_user, job_ptr); } /* * The job submit plugin sets site_factor to NO_VAL before calling * the plugin to prevent the user from specifying it. */ if (user_site_factor != NO_VAL) { if (!operator) { error("%s: Attempt to change SiteFactor for %pJ", __func__, job_ptr); error_code = ESLURM_ACCESS_DENIED; job_specs->site_factor = NO_VAL; } else job_specs->site_factor = user_site_factor; } if (job_specs->site_factor != NO_VAL) { sched_info("%s: setting AdinPrioFactor to %u for %pJ", __func__, job_specs->site_factor, job_ptr); job_ptr->site_factor = job_specs->site_factor; } fini: FREE_NULL_BITMAP(new_req_bitmap); FREE_NULL_LIST(part_ptr_list); if ((error_code == SLURM_SUCCESS) && tres_req_cnt_set) { for (tres_pos = 0; tres_pos < slurmctld_tres_cnt; tres_pos++) { if (!tres_req_cnt[tres_pos] || (tres_req_cnt[tres_pos] == job_ptr->tres_req_cnt[tres_pos])) continue; job_ptr->tres_req_cnt[tres_pos] = tres_req_cnt[tres_pos]; tres_changed = true; } if (tres_changed) { job_ptr->tres_req_cnt[TRES_ARRAY_BILLING] = assoc_mgr_tres_weighted( job_ptr->tres_req_cnt, job_ptr->part_ptr->billing_weights, slurmctld_conf.priority_flags, false); set_job_tres_req_str(job_ptr, false); update_accounting = true; job_ptr->node_cnt_wag = 0; } } /* This was a local variable, so set it back to NULL */ job_specs->tres_req_cnt = NULL; FREE_NULL_LIST(gres_list); FREE_NULL_LIST(license_list); if (update_accounting) { info("%s: updating accounting", __func__); /* Update job record in accounting to reflect changes */ jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } /* * If job isn't held recalculate the priority when not using * priority/basic. Since many factors of an update may affect priority * considerations. Do this whether or not the update was successful or * not. */ if ((job_ptr->priority != 0) && xstrcmp(slurmctld_conf.priority_type, "priority/basic")) set_job_prio(job_ptr); if ((error_code == SLURM_SUCCESS) && fed_mgr_fed_rec && job_ptr->fed_details && fed_mgr_is_origin_job(job_ptr)) { /* Send updates to sibling jobs */ /* Add the siblings_active to be updated. They could have been * updated if the job's ClusterFeatures were updated. */ job_specs->fed_siblings_viable = job_ptr->fed_details->siblings_viable; fed_mgr_update_job(job_ptr->job_id, job_specs, job_ptr->fed_details->siblings_active, uid); } return error_code; } /* * update_job - update a job's parameters per the supplied specifications * IN msg - RPC to update job, including change specification * IN uid - uid of user issuing RPC * IN send_msg - whether to send msg back or not * RET returns an error code from slurm_errno.h * global: job_list - global list of job entries * last_job_update - time of last job table update */ extern int update_job(slurm_msg_t *msg, uid_t uid, bool send_msg) { job_desc_msg_t *job_specs = (job_desc_msg_t *) msg->data; job_record_t *job_ptr; char *hostname = g_slurm_auth_get_host(msg->auth_cred); int rc; xfree(job_specs->job_id_str); xstrfmtcat(job_specs->job_id_str, "%u", job_specs->job_id); if (hostname) { xfree(job_specs->alloc_node); job_specs->alloc_node = hostname; } job_ptr = find_job_record(job_specs->job_id); if (job_ptr == NULL) { info("%s: JobId=%u does not exist", __func__, job_specs->job_id); rc = ESLURM_INVALID_JOB_ID; } else { if (job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) job_specs->array_bitmap = bit_copy(job_ptr->array_recs->task_id_bitmap); rc = _update_job(job_ptr, job_specs, uid); } if (send_msg && rc != ESLURM_JOB_SETTING_DB_INX) slurm_send_rc_msg(msg, rc); xfree(job_specs->job_id_str); return rc; } /* * IN msg - RPC to update job, including change specification * IN job_specs - a job's specification * IN uid - uid of user issuing RPC * RET returns an error code from slurm_errno.h * global: job_list - global list of job entries * last_job_update - time of last job table update */ extern int update_job_str(slurm_msg_t *msg, uid_t uid) { slurm_msg_t resp_msg; job_desc_msg_t *job_specs = (job_desc_msg_t *) msg->data; job_record_t *job_ptr, *new_job_ptr, *het_job; char *hostname = g_slurm_auth_get_host(msg->auth_cred); ListIterator iter; long int long_id; uint32_t job_id = 0, het_job_offset; bitstr_t *array_bitmap = NULL, *tmp_bitmap; bool valid = true; int32_t i, i_first, i_last; int len, rc = SLURM_SUCCESS, rc2; char *end_ptr, *tok, *tmp = NULL; char *job_id_str; resp_array_struct_t *resp_array = NULL; job_array_resp_msg_t *resp_array_msg = NULL; return_code_msg_t rc_msg; job_id_str = job_specs->job_id_str; if (hostname) { xfree(job_specs->alloc_node); job_specs->alloc_node = hostname; } if (max_array_size == NO_VAL) max_array_size = slurmctld_conf.max_array_sz; long_id = strtol(job_id_str, &end_ptr, 10); if ((long_id <= 0) || (long_id == LONG_MAX) || ((end_ptr[0] != '\0') && (end_ptr[0] != '_') && (end_ptr[0] != '+'))) { info("%s: invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto reply; } job_id = (uint32_t) long_id; if (end_ptr[0] == '\0') { /* Single job (or full job array) */ job_record_t *job_ptr_done = NULL; job_ptr = find_job_record(job_id); if (job_ptr && job_ptr->het_job_list) { iter = list_iterator_create(job_ptr->het_job_list); while ((het_job = list_next(iter))) { if (job_ptr->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, job_ptr); continue; } rc = _update_job(het_job, job_specs, uid); } list_iterator_destroy(iter); goto reply; } if (job_ptr && (((job_ptr->array_task_id == NO_VAL) && (job_ptr->array_recs == NULL)) || ((job_ptr->array_task_id != NO_VAL) && (job_ptr->array_job_id != job_id)))) { /* This is a regular job or single task of job array */ rc = _update_job(job_ptr, job_specs, uid); goto reply; } if (job_ptr && job_ptr->array_recs) { /* This is a job array */ job_ptr_done = job_ptr; if (job_ptr->array_recs->task_id_bitmap) job_specs->array_bitmap = bit_copy( job_ptr->array_recs->task_id_bitmap); rc2 = _update_job(job_ptr, job_specs, uid); if (rc2 == ESLURM_JOB_SETTING_DB_INX) { rc = rc2; goto reply; } _resp_array_add(&resp_array, job_ptr, rc2); } /* Update all tasks of this job array */ job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; if (!job_ptr && !job_ptr_done) { info("%s: invalid JobId=%u", __func__, job_id); rc = ESLURM_INVALID_JOB_ID; goto reply; } while (job_ptr) { if ((job_ptr->array_job_id == job_id) && (job_ptr != job_ptr_done)) { rc2 = _update_job(job_ptr, job_specs, uid); if (rc2 == ESLURM_JOB_SETTING_DB_INX) { rc = rc2; goto reply; } _resp_array_add(&resp_array, job_ptr, rc2); } job_ptr = job_ptr->job_array_next_j; } goto reply; } else if (end_ptr[0] == '+') { /* Hetjob element */ long_id = strtol(end_ptr+1, &tmp, 10); if ((long_id < 0) || (long_id == LONG_MAX) || (tmp[0] != '\0')) { info("%s: invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto reply; } het_job_offset = (uint32_t) long_id; job_ptr = find_het_job_record(job_id, het_job_offset); if (!job_ptr) { info("%s: invalid JobId=%u", __func__, job_id); rc = ESLURM_INVALID_JOB_ID; goto reply; } rc = _update_job(job_ptr, job_specs, uid); goto reply; } array_bitmap = bit_alloc(max_array_size); tmp = xstrdup(end_ptr + 1); tok = strtok_r(tmp, ",", &end_ptr); while (tok && valid) { valid = _parse_array_tok(tok, array_bitmap, max_array_size); tok = strtok_r(NULL, ",", &end_ptr); } xfree(tmp); if (valid) { i_last = bit_fls(array_bitmap); if (i_last < 0) valid = false; } if (!valid) { info("%s: invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto reply; } job_ptr = find_job_record(job_id); if (job_ptr && IS_JOB_PENDING(job_ptr) && job_ptr->array_recs && job_ptr->array_recs->task_id_bitmap) { /* Ensure bitmap sizes match for AND operations */ len = bit_size(job_ptr->array_recs->task_id_bitmap); i_last++; if (i_last < len) { array_bitmap = bit_realloc(array_bitmap, len); } else { array_bitmap = bit_realloc(array_bitmap, i_last); job_ptr->array_recs->task_id_bitmap = bit_realloc( job_ptr->array_recs->task_id_bitmap, i_last); } if (!bit_overlap_any(job_ptr->array_recs->task_id_bitmap, array_bitmap)) { /* Nothing to do with this job record */ } else if (bit_super_set(job_ptr->array_recs->task_id_bitmap, array_bitmap)) { /* Update the record with all pending tasks */ job_specs->array_bitmap = bit_copy(job_ptr->array_recs->task_id_bitmap); rc2 = _update_job(job_ptr, job_specs, uid); if (rc2 == ESLURM_JOB_SETTING_DB_INX) { rc = rc2; goto reply; } _resp_array_add(&resp_array, job_ptr, rc2); bit_and_not(array_bitmap, job_specs->array_bitmap); } else { /* Need to split out tasks to separate job records */ tmp_bitmap = bit_copy(job_ptr->array_recs-> task_id_bitmap); bit_and(tmp_bitmap, array_bitmap); i_first = bit_ffs(tmp_bitmap); if (i_first >= 0) i_last = bit_fls(tmp_bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(tmp_bitmap, i)) continue; job_ptr->array_task_id = i; new_job_ptr = job_array_split(job_ptr); if (!new_job_ptr) { error("%s: Unable to copy record for %pJ", __func__, job_ptr); } else { /* The array_recs structure is moved * to the new job record copy */ bb_g_job_validate2(job_ptr, NULL); job_ptr = new_job_ptr; } } FREE_NULL_BITMAP(tmp_bitmap); } } i_first = bit_ffs(array_bitmap); if (i_first >= 0) i_last = bit_fls(array_bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(array_bitmap, i)) continue; job_ptr = find_job_array_rec(job_id, i); if (job_ptr == NULL) { info("%s: invalid JobId=%u_%d", __func__, job_id, i); _resp_array_add_id(&resp_array, job_id, i, ESLURM_INVALID_JOB_ID); continue; } rc2 = _update_job(job_ptr, job_specs, uid); if (rc2 == ESLURM_JOB_SETTING_DB_INX) { rc = rc2; goto reply; } _resp_array_add(&resp_array, job_ptr, rc2); } reply: if ((rc != ESLURM_JOB_SETTING_DB_INX) && (msg->conn_fd >= 0)) { slurm_msg_t_init(&resp_msg); resp_msg.protocol_version = msg->protocol_version; if (resp_array) { resp_array_msg = _resp_array_xlate(resp_array, job_id); resp_msg.msg_type = RESPONSE_JOB_ARRAY_ERRORS; resp_msg.data = resp_array_msg; } else { resp_msg.msg_type = RESPONSE_SLURM_RC; rc_msg.return_code = rc; resp_msg.data = &rc_msg; } resp_msg.conn = msg->conn; slurm_send_node_msg(msg->conn_fd, &resp_msg); if (resp_array_msg) { slurm_free_job_array_resp(resp_array_msg); resp_msg.data = NULL; } } _resp_array_free(resp_array); FREE_NULL_BITMAP(array_bitmap); return rc; } static void _send_job_kill(job_record_t *job_ptr) { kill_job_msg_t *kill_job = NULL; agent_arg_t *agent_args = NULL; #ifdef HAVE_FRONT_END front_end_record_t *front_end_ptr; #else int i; node_record_t *node_ptr; #endif xassert(job_ptr); xassert(job_ptr->details); agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_TERMINATE_JOB; agent_args->retry = 0; /* re_kill_job() resends as needed */ agent_args->hostlist = hostlist_create(NULL); kill_job = xmalloc(sizeof(kill_job_msg_t)); last_node_update = time(NULL); kill_job->job_gres_info = gres_plugin_epilog_build_env(job_ptr->gres_list,job_ptr->nodes); kill_job->job_id = job_ptr->job_id; kill_job->het_job_id = job_ptr->het_job_id; kill_job->step_id = NO_VAL; kill_job->job_state = job_ptr->job_state; kill_job->job_uid = job_ptr->user_id; kill_job->job_gid = job_ptr->group_id; kill_job->nodes = xstrdup(job_ptr->nodes); kill_job->time = time(NULL); kill_job->start_time = job_ptr->start_time; kill_job->select_jobinfo = select_g_select_jobinfo_copy( job_ptr->select_jobinfo); kill_job->spank_job_env = xduparray(job_ptr->spank_job_env_size, job_ptr->spank_job_env); kill_job->spank_job_env_size = job_ptr->spank_job_env_size; #ifdef HAVE_FRONT_END if (job_ptr->batch_host && (front_end_ptr = job_ptr->front_end_ptr)) { agent_args->protocol_version = front_end_ptr->protocol_version; hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); agent_args->node_count++; } #else if (!job_ptr->node_bitmap_cg) build_cg_bitmap(job_ptr); agent_args->protocol_version = SLURM_PROTOCOL_VERSION; for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count; i++, node_ptr++) { if (!bit_test(job_ptr->node_bitmap_cg, i)) continue; if (agent_args->protocol_version > node_ptr->protocol_version) agent_args->protocol_version = node_ptr->protocol_version; hostlist_push_host(agent_args->hostlist, node_ptr->name); agent_args->node_count++; } #endif if (agent_args->node_count == 0) { if (job_ptr->details->expanding_jobid == 0) { error("%s: %pJ allocated no nodes to be killed on", __func__, job_ptr); } xfree(kill_job->nodes); xfree(kill_job); hostlist_destroy(agent_args->hostlist); xfree(agent_args); return; } agent_args->msg_args = kill_job; agent_queue_request(agent_args); return; } /* Record accounting information for a job immediately before changing size */ extern void job_pre_resize_acctg(job_record_t *job_ptr) { /* if we don't have a db_index go a start this one up since if running with the slurmDBD the job may not have started yet. */ if ((!job_ptr->db_index || job_ptr->db_index == NO_VAL64) && !job_ptr->resize_time) jobacct_storage_g_job_start(acct_db_conn, job_ptr); job_ptr->job_state |= JOB_RESIZING; /* NOTE: job_completion_logger() calls * acct_policy_remove_job_submit() */ job_completion_logger(job_ptr, false); /* This doesn't happen in job_completion_logger, but gets * added back in with job_post_resize_acctg so remove it here. */ acct_policy_job_fini(job_ptr); /* NOTE: The RESIZING FLAG needed to be cleared with job_post_resize_acctg */ } /* Record accounting information for a job immediately after changing size */ extern void job_post_resize_acctg(job_record_t *job_ptr) { time_t org_submit = job_ptr->details->submit_time; /* * NOTE: The RESIZING FLAG needed to be set with job_pre_resize_acctg() * the assert is here to make sure we code it that way. */ xassert(IS_JOB_RESIZING(job_ptr)); acct_policy_add_job_submit(job_ptr); /* job_set_alloc_tres() must be called before acct_policy_job_begin() */ job_set_alloc_tres(job_ptr, false); acct_policy_job_begin(job_ptr); job_claim_resv(job_ptr); if (job_ptr->resize_time) job_ptr->details->submit_time = job_ptr->resize_time; job_ptr->resize_time = time(NULL); /* FIXME: see if this can be changed to job_start_direct() */ jobacct_storage_g_job_start(acct_db_conn, job_ptr); job_ptr->details->submit_time = org_submit; job_ptr->job_state &= (~JOB_RESIZING); /* * Reset the end_time_exp that was probably set to NO_VAL when * ending the job on the resize. If using the * priority/multifactor plugin if the end_time_exp is NO_VAL * it will not run again for the job. */ job_ptr->end_time_exp = job_ptr->end_time; /* * If a job is resized, the core bitmap will differ in the step. * See rebuild_step_bitmaps(). The problem will go away when we have * per-node core bitmaps. For now just set a flag that the job was * resized while there were active job steps. */ if (job_ptr->step_list && (list_count(job_ptr->step_list) > 0)) job_ptr->bit_flags |= JOB_RESIZED; } static char *_build_step_id(char *buf, int buf_len, uint32_t step_id) { if (step_id == SLURM_BATCH_SCRIPT) snprintf(buf, buf_len, "StepId=Batch"); else snprintf(buf, buf_len, "StepId=%u", step_id); return buf; } /* * validate_jobs_on_node - validate that any jobs that should be on the node * are actually running, if not clean up the job records and/or node * records. * IN reg_msg - node registration message */ extern void validate_jobs_on_node(slurm_node_registration_status_msg_t *reg_msg) { int i, node_inx, jobs_on_node; node_record_t *node_ptr; job_record_t *job_ptr; step_record_t *step_ptr; char step_str[64]; time_t now = time(NULL); node_ptr = find_node_record(reg_msg->node_name); if (node_ptr == NULL) { error("slurmd registered on unknown node %s", reg_msg->node_name); return; } if (reg_msg->energy) memcpy(node_ptr->energy, reg_msg->energy, sizeof(acct_gather_energy_t)); if (node_ptr->up_time > reg_msg->up_time) { verbose("Node %s rebooted %u secs ago", reg_msg->node_name, reg_msg->up_time); } if (reg_msg->up_time <= now) { node_ptr->up_time = reg_msg->up_time; node_ptr->boot_time = now - reg_msg->up_time; node_ptr->slurmd_start_time = reg_msg->slurmd_start_time; } else { error("Node up_time is invalid: %u>%u", reg_msg->up_time, (uint32_t) now); } if (waiting_for_node_boot(node_ptr)) return; node_inx = node_ptr - node_record_table_ptr; /* Check that jobs running are really supposed to be there */ for (i = 0; i < reg_msg->job_count; i++) { if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) && (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) { info("NoAllocate JobId=%u %s reported on node %s", reg_msg->job_id[i], _build_step_id(step_str, sizeof(step_str), reg_msg->step_id[i]), reg_msg->node_name); continue; } job_ptr = find_job_record(reg_msg->job_id[i]); if (job_ptr == NULL) { error("Orphan JobId=%u %s reported on node %s", reg_msg->job_id[i], _build_step_id(step_str, sizeof(step_str), reg_msg->step_id[i]), reg_msg->node_name); abort_job_on_node(reg_msg->job_id[i], job_ptr, node_ptr->name); } else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) { if (bit_test(job_ptr->node_bitmap, node_inx)) { if ((job_ptr->batch_flag) && (node_inx == bit_ffs( job_ptr->node_bitmap))) { /* NOTE: Used for purging defunct * batch jobs */ job_ptr->time_last_active = now; } step_ptr = find_step_record(job_ptr, reg_msg-> step_id[i]); if (step_ptr) step_ptr->time_last_active = now; debug3("Registered %pS on node %s", step_ptr, reg_msg->node_name); } else { /* Typically indicates a job requeue and * restart on another nodes. A node from the * original allocation just responded here. */ error("Registered %pJ %s on wrong node %s", job_ptr, _build_step_id(step_str, sizeof(step_str), reg_msg->step_id[i]), reg_msg->node_name); info("%s: job nodes %s count %d inx %d", __func__, job_ptr->nodes, job_ptr->node_cnt, node_inx); abort_job_on_node(reg_msg->job_id[i], job_ptr, node_ptr->name); } } else if (IS_JOB_COMPLETING(job_ptr)) { /* * Re-send kill request as needed, * not necessarily an error */ kill_job_on_node(job_ptr, node_ptr); } else if (IS_JOB_PENDING(job_ptr)) { /* Typically indicates a job requeue and the hung * slurmd that went DOWN is now responding */ error("Registered PENDING %pJ %s on node %s", job_ptr, _build_step_id(step_str, sizeof(step_str), reg_msg->step_id[i]), reg_msg->node_name); abort_job_on_node(reg_msg->job_id[i], job_ptr, node_ptr->name); } else if (difftime(now, job_ptr->end_time) < slurm_get_msg_timeout()) { /* Race condition */ debug("Registered newly completed %pJ %s on %s", job_ptr, _build_step_id(step_str, sizeof(step_str), reg_msg->step_id[i]), node_ptr->name); } else { /* else job is supposed to be done */ error("Registered %pJ %s in state %s on node %s", job_ptr, _build_step_id(step_str, sizeof(step_str), reg_msg->step_id[i]), job_state_string(job_ptr->job_state), reg_msg->node_name); kill_job_on_node(job_ptr, node_ptr); } } jobs_on_node = node_ptr->run_job_cnt + node_ptr->comp_job_cnt; if (jobs_on_node) _purge_missing_jobs(node_inx, now); if (jobs_on_node != reg_msg->job_count) { /* slurmd will not know of a job unless the job has * steps active at registration time, so this is not * an error condition, slurmd is also reporting steps * rather than jobs */ debug3("resetting job_count on node %s from %u to %d", reg_msg->node_name, reg_msg->job_count, jobs_on_node); reg_msg->job_count = jobs_on_node; } return; } /* Purge any batch job that should have its script running on node * node_inx, but is not. Allow BatchStartTimeout + ResumeTimeout seconds * for startup. * * Purge all job steps that were started before the node was last booted. * * Also notify srun if any job steps should be active on this node * but are not found. */ static void _purge_missing_jobs(int node_inx, time_t now) { ListIterator job_iterator; job_record_t *job_ptr; node_record_t *node_ptr = node_record_table_ptr + node_inx; uint16_t batch_start_timeout = slurm_get_batch_start_timeout(); uint16_t msg_timeout = slurm_get_msg_timeout(); uint16_t resume_timeout = slurm_get_resume_timeout(); uint32_t suspend_time = slurm_get_suspend_time(); time_t batch_startup_time, node_boot_time = (time_t) 0, startup_time; if (node_ptr->boot_time > (msg_timeout + 5)) { /* allow for message timeout and other delays */ node_boot_time = node_ptr->boot_time - (msg_timeout + 5); } batch_startup_time = now - batch_start_timeout; batch_startup_time -= MIN(DEFAULT_MSG_TIMEOUT, msg_timeout); job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if ((IS_JOB_CONFIGURING(job_ptr) || (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) || (!bit_test(job_ptr->node_bitmap, node_inx))) continue; if ((job_ptr->batch_flag != 0) && (suspend_time != 0) /* power mgmt on */ && (job_ptr->start_time < node_boot_time)) { startup_time = batch_startup_time - resume_timeout; } else startup_time = batch_startup_time; if ((job_ptr->batch_flag != 0) && (job_ptr->het_job_offset == 0) && (job_ptr->time_last_active < startup_time) && (job_ptr->start_time < startup_time) && (node_ptr == find_node_record(job_ptr->batch_host))) { bool requeue = false; char *requeue_msg = ""; if (job_ptr->details && job_ptr->details->requeue) { requeue = true; requeue_msg = ", Requeuing job"; } info("Batch %pJ missing from batch node %s (not found BatchStartTime after startup)%s", job_ptr, job_ptr->batch_host, requeue_msg); job_ptr->exit_code = 1; job_complete(job_ptr->job_id, slurmctld_conf.slurm_user_id, requeue, true, NO_VAL); } else { _notify_srun_missing_step(job_ptr, node_inx, now, node_boot_time); } } list_iterator_destroy(job_iterator); } static void _notify_srun_missing_step(job_record_t *job_ptr, int node_inx, time_t now, time_t node_boot_time) { ListIterator step_iterator; step_record_t *step_ptr; char *node_name = node_record_table_ptr[node_inx].name; xassert(job_ptr); step_iterator = list_iterator_create (job_ptr->step_list); while ((step_ptr = list_next(step_iterator))) { if ((step_ptr->step_id == SLURM_EXTERN_CONT) || (step_ptr->step_id == SLURM_BATCH_SCRIPT) || (step_ptr->state != JOB_RUNNING)) continue; if (!bit_test(step_ptr->step_node_bitmap, node_inx)) continue; if (step_ptr->time_last_active >= now) { /* Back up timer in case more than one node * registration happens at this same time. * We don't want this node's registration * to count toward a different node's * registration message. */ step_ptr->time_last_active = now - 1; } else if (step_ptr->host && step_ptr->port) { /* srun may be able to verify step exists on * this node using I/O sockets and kill the * job as needed */ srun_step_missing(step_ptr, node_name); } else if ((step_ptr->start_time < node_boot_time) && (step_ptr->no_kill == 0)) { /* There is a risk that the job step's tasks completed * on this node before its reboot, but that should be * very rare and there is no srun to work with (POE) */ info("Node %s rebooted, killing missing step %u.%u", node_name, job_ptr->job_id, step_ptr->step_id); signal_step_tasks_on_node(node_name, step_ptr, SIGKILL, REQUEST_TERMINATE_TASKS); } } list_iterator_destroy (step_iterator); } /* * abort_job_on_node - Kill the specific job_id on a specific node, * the request is not processed immediately, but queued. * This is to prevent a flood of pthreads if slurmctld restarts * without saved state and slurmd daemons register with a * multitude of running jobs. Slurmctld will not recognize * these jobs and use this function to kill them - one * agent request per node as they register. * IN job_id - id of the job to be killed * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. job reported * by slurmd on some node, but job records already purged from * slurmctld) * IN node_name - name of the node on which the job resides */ extern void abort_job_on_node(uint32_t job_id, job_record_t *job_ptr, char *node_name) { agent_arg_t *agent_info; kill_job_msg_t *kill_req; kill_req = xmalloc(sizeof(kill_job_msg_t)); kill_req->job_id = job_id; kill_req->step_id = NO_VAL; kill_req->time = time(NULL); kill_req->nodes = xstrdup(node_name); if (job_ptr) { /* NULL if unknown */ kill_req->job_gres_info = gres_plugin_epilog_build_env(job_ptr->gres_list, job_ptr->nodes); kill_req->het_job_id = job_ptr->het_job_id; kill_req->start_time = job_ptr->start_time; kill_req->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size, job_ptr->spank_job_env); kill_req->spank_job_env_size = job_ptr->spank_job_env_size; } else { /* kill_req->start_time = 0; Default value */ } agent_info = xmalloc(sizeof(agent_arg_t)); agent_info->node_count = 1; agent_info->retry = 0; agent_info->hostlist = hostlist_create(node_name); #ifdef HAVE_FRONT_END if (job_ptr && job_ptr->front_end_ptr) agent_info->protocol_version = job_ptr->front_end_ptr->protocol_version; if (job_ptr) { debug("Aborting %pJ on front end node %s", job_ptr, node_name); } else { debug("Aborting JobId=%u on front end node %s", job_id, node_name); } #else node_record_t *node_ptr; if ((node_ptr = find_node_record(node_name))) agent_info->protocol_version = node_ptr->protocol_version; if (job_ptr) debug("Aborting %pJ on node %s", job_ptr, node_name); else debug("Aborting JobId=%u on node %s", job_id, node_name); #endif agent_info->msg_type = REQUEST_ABORT_JOB; agent_info->msg_args = kill_req; agent_queue_request(agent_info); } /* * abort_job_on_nodes - Kill the specific job_on the specific nodes, * the request is not processed immediately, but queued. * This is to prevent a flood of pthreads if slurmctld restarts * without saved state and slurmd daemons register with a * multitude of running jobs. Slurmctld will not recognize * these jobs and use this function to kill them - one * agent request per node as they register. * IN job_ptr - pointer to terminating job * IN node_name - name of the node on which the job resides */ extern void abort_job_on_nodes(job_record_t *job_ptr, bitstr_t *node_bitmap) { bitstr_t *full_node_bitmap, *tmp_node_bitmap; node_record_t *node_ptr; int i, i_first, i_last; agent_arg_t *agent_info; kill_job_msg_t *kill_req; uint16_t protocol_version; #ifdef HAVE_FRONT_END fatal("%s: front-end mode not supported", __func__); #endif xassert(node_bitmap); /* Send a separate message for nodes at different protocol_versions */ full_node_bitmap = bit_copy(node_bitmap); while ((i_first = bit_ffs(full_node_bitmap)) >= 0) { i_last = bit_fls(full_node_bitmap); node_ptr = node_record_table_ptr + i_first; protocol_version = node_ptr->protocol_version; tmp_node_bitmap = bit_alloc(bit_size(node_bitmap)); for (i = i_first; i <= i_last; i++) { if (!bit_test(full_node_bitmap, i)) continue; node_ptr = node_record_table_ptr + i; if (node_ptr->protocol_version != protocol_version) continue; bit_clear(full_node_bitmap, i); bit_set(tmp_node_bitmap, i); } kill_req = xmalloc(sizeof(kill_job_msg_t)); kill_req->job_gres_info = gres_plugin_epilog_build_env(job_ptr->gres_list, job_ptr->nodes); kill_req->job_id = job_ptr->job_id; kill_req->step_id = NO_VAL; kill_req->time = time(NULL); kill_req->nodes = bitmap2node_name(tmp_node_bitmap); kill_req->het_job_id = job_ptr->het_job_id; kill_req->start_time = job_ptr->start_time; kill_req->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size, job_ptr->spank_job_env); kill_req->spank_job_env_size = job_ptr->spank_job_env_size; agent_info = xmalloc(sizeof(agent_arg_t)); agent_info->node_count = bit_set_count(tmp_node_bitmap); agent_info->retry = 1; agent_info->hostlist = hostlist_create(kill_req->nodes); debug("Aborting %pJ on nodes %s", job_ptr, kill_req->nodes); agent_info->msg_type = REQUEST_ABORT_JOB; agent_info->msg_args = kill_req; agent_info->protocol_version = protocol_version; agent_queue_request(agent_info); bit_free(tmp_node_bitmap); } bit_free(full_node_bitmap); } /* * kill_job_on_node - Kill the specific job on a specific node. * IN job_ptr - pointer to terminating job (NULL if unknown, e.g. orphaned) * IN node_ptr - pointer to the node on which the job resides */ extern void kill_job_on_node(job_record_t *job_ptr, node_record_t *node_ptr) { agent_arg_t *agent_info; kill_job_msg_t *kill_req; kill_req = xmalloc(sizeof(kill_job_msg_t)); kill_req->job_gres_info = gres_plugin_epilog_build_env(job_ptr->gres_list,job_ptr->nodes); kill_req->het_job_id = job_ptr->het_job_id; kill_req->job_id = job_ptr->job_id; kill_req->step_id = NO_VAL; kill_req->time = time(NULL); kill_req->start_time = job_ptr->start_time; kill_req->nodes = xstrdup(node_ptr->name); kill_req->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); kill_req->job_state = job_ptr->job_state; kill_req->spank_job_env = xduparray(job_ptr->spank_job_env_size, job_ptr->spank_job_env); kill_req->spank_job_env_size = job_ptr->spank_job_env_size; agent_info = xmalloc(sizeof(agent_arg_t)); agent_info->node_count = 1; agent_info->retry = 0; #ifdef HAVE_FRONT_END xassert(job_ptr->batch_host); if (job_ptr->front_end_ptr) agent_info->protocol_version = job_ptr->front_end_ptr->protocol_version; agent_info->hostlist = hostlist_create(job_ptr->batch_host); debug("Killing %pJ on front end node %s", job_ptr, job_ptr->batch_host); #else agent_info->protocol_version = node_ptr->protocol_version; agent_info->hostlist = hostlist_create(node_ptr->name); debug("Killing %pJ on node %s", job_ptr, node_ptr->name); #endif agent_info->msg_type = REQUEST_TERMINATE_JOB; agent_info->msg_args = kill_req; agent_queue_request(agent_info); } /* * Return true if this job is complete (including all elements of a hetjob) */ static bool _job_all_finished(job_record_t *job_ptr) { job_record_t *het_job; ListIterator iter; bool finished = true; if (!IS_JOB_FINISHED(job_ptr)) return false; if (!job_ptr->het_job_list) return true; iter = list_iterator_create(job_ptr->het_job_list); while ((het_job = list_next(iter))) { if (!IS_JOB_FINISHED(het_job)) { finished = false; break; } } list_iterator_destroy(iter); return finished; } /* * job_alloc_info_ptr - get details about an existing job allocation * IN uid - job issuing the code * IN job_ptr - pointer to job record * NOTE: See job_alloc_info() if job pointer not known */ extern int job_alloc_info_ptr(uint32_t uid, job_record_t *job_ptr) { uint8_t prolog = 0; if ((slurmctld_conf.private_data & PRIVATE_DATA_JOBS) && (job_ptr->user_id != uid) && !validate_operator(uid) && (((slurm_mcs_get_privatedata() == 0) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) || ((slurm_mcs_get_privatedata() == 1) && (mcs_g_check_mcs_label(uid, job_ptr->mcs_label) != 0)))) return ESLURM_ACCESS_DENIED; if (IS_JOB_PENDING(job_ptr)) return ESLURM_JOB_PENDING; if (_job_all_finished(job_ptr)) return ESLURM_ALREADY_DONE; if (job_ptr->details) prolog = job_ptr->details->prolog_running; if (job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") && (prolog == 0) && job_ptr->node_bitmap && (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) == 0)) { last_job_update = time(NULL); set_job_alias_list(job_ptr); } return SLURM_SUCCESS; } /* * job_alloc_info - get details about an existing job allocation * IN uid - job issuing the code * IN job_id - ID of job for which info is requested * OUT job_pptr - set to pointer to job record * NOTE: See job_alloc_info_ptr() if job pointer is known */ extern int job_alloc_info(uint32_t uid, uint32_t job_id, job_record_t **job_pptr) { job_record_t *job_ptr; job_ptr = find_job_record(job_id); if (job_ptr == NULL) return ESLURM_INVALID_JOB_ID; if (job_pptr) *job_pptr = job_ptr; return job_alloc_info_ptr(uid, job_ptr); } /* * Synchronize the batch job in the system with their files. * All pending batch jobs must have script and environment files * No other jobs should have such files */ int sync_job_files(void) { List batch_dirs; xassert(verify_lock(CONF_LOCK, READ_LOCK)); xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); if (!slurmctld_primary) /* Don't purge files from backup slurmctld */ return SLURM_SUCCESS; batch_dirs = list_create(xfree_ptr); _get_batch_job_dir_ids(batch_dirs); _validate_job_files(batch_dirs); _remove_defunct_batch_dirs(batch_dirs); FREE_NULL_LIST(batch_dirs); return SLURM_SUCCESS; } /* Append to the batch_dirs list the job_id's associated with * every batch job directory in existence */ static void _get_batch_job_dir_ids(List batch_dirs) { DIR *f_dir, *h_dir; struct dirent *dir_ent, *hash_ent; long long_job_id; uint32_t *job_id_ptr; char *endptr; xassert(verify_lock(CONF_LOCK, READ_LOCK)); xassert(slurmctld_conf.state_save_location); f_dir = opendir(slurmctld_conf.state_save_location); if (!f_dir) { error("opendir(%s): %m", slurmctld_conf.state_save_location); return; } while ((dir_ent = readdir(f_dir))) { if (!xstrncmp("hash.#", dir_ent->d_name, 5)) { char *h_path = NULL; xstrfmtcat(h_path, "%s/%s", slurmctld_conf.state_save_location, dir_ent->d_name); h_dir = opendir(h_path); xfree(h_path); if (!h_dir) continue; while ((hash_ent = readdir(h_dir))) { if (xstrncmp("job.#", hash_ent->d_name, 4)) continue; long_job_id = strtol(&hash_ent->d_name[4], &endptr, 10); if ((long_job_id == 0) || (endptr[0] != '\0')) continue; debug3("Found batch directory for JobId=%ld", long_job_id); job_id_ptr = xmalloc(sizeof(uint32_t)); *job_id_ptr = long_job_id; list_append(batch_dirs, job_id_ptr); } closedir(h_dir); } } closedir(f_dir); } static int _clear_state_dir_flag(void *x, void *arg) { job_record_t *job_ptr = (job_record_t *) x; job_ptr->bit_flags &= ~HAS_STATE_DIR; return 0; } static int _test_state_dir_flag(void *x, void *arg) { job_record_t *job_ptr = (job_record_t *) x; if (job_ptr->bit_flags & HAS_STATE_DIR) { job_ptr->bit_flags &= ~HAS_STATE_DIR; return 0; } if (!job_ptr->batch_flag || !IS_JOB_PENDING(job_ptr) || (job_ptr->het_job_offset > 0)) return 0; /* No files expected */ error("Script for %pJ lost, state set to FAILED", job_ptr); job_ptr->job_state = JOB_FAILED; job_ptr->exit_code = 1; job_ptr->state_reason = FAIL_SYSTEM; xfree(job_ptr->state_desc); job_ptr->start_time = job_ptr->end_time = time(NULL); job_completion_logger(job_ptr, false); return 0; } /* All pending batch jobs must have a batch_dir entry, * otherwise we flag it as FAILED and don't schedule * If the batch_dir entry exists for a PENDING or RUNNING batch job, * remove it the list (of directories to be deleted) */ static void _validate_job_files(List batch_dirs) { job_record_t *job_ptr; ListIterator batch_dir_iter; uint32_t *job_id_ptr, array_job_id; list_for_each(job_list, _clear_state_dir_flag, NULL); batch_dir_iter = list_iterator_create(batch_dirs); while ((job_id_ptr = list_next(batch_dir_iter))) { job_ptr = find_job_record(*job_id_ptr); if (job_ptr) { job_ptr->bit_flags |= HAS_STATE_DIR; list_delete_item(batch_dir_iter); } if (job_ptr && job_ptr->array_recs) { /* Update all tasks */ array_job_id = job_ptr->array_job_id; job_ptr = job_array_hash_j[JOB_HASH_INX(array_job_id)]; while (job_ptr) { if (job_ptr->array_job_id == array_job_id) job_ptr->bit_flags |= HAS_STATE_DIR; job_ptr = job_ptr->job_array_next_j; } } } list_iterator_destroy(batch_dir_iter); list_for_each(job_list, _test_state_dir_flag, NULL); } /* Remove all batch_dir entries in the list */ static void _remove_defunct_batch_dirs(List batch_dirs) { ListIterator batch_dir_inx; uint32_t *job_id_ptr; xassert(verify_lock(CONF_LOCK, READ_LOCK)); batch_dir_inx = list_iterator_create(batch_dirs); while ((job_id_ptr = list_next(batch_dir_inx))) { info("Purged files for defunct batch JobId=%u", *job_id_ptr); delete_job_desc_files(*job_id_ptr); } list_iterator_destroy(batch_dir_inx); } /* * _xmit_new_end_time * Tell all slurmd's associated with a job of its new end time * IN job_ptr - pointer to terminating job * globals: node_record_count - number of nodes in the system * node_record_table_ptr - pointer to global node table */ static void _xmit_new_end_time(job_record_t *job_ptr) { #ifndef HAVE_FRONT_END int i; #endif job_time_msg_t *job_time_msg_ptr; agent_arg_t *agent_args; agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_UPDATE_JOB_TIME; agent_args->retry = 1; agent_args->hostlist = hostlist_create(NULL); job_time_msg_ptr = xmalloc(sizeof(job_time_msg_t)); job_time_msg_ptr->job_id = job_ptr->job_id; job_time_msg_ptr->expiration_time = job_ptr->end_time; #ifdef HAVE_FRONT_END xassert(job_ptr->batch_host); if (job_ptr->front_end_ptr) agent_args->protocol_version = job_ptr->front_end_ptr->protocol_version; hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); agent_args->node_count = 1; #else agent_args->protocol_version = SLURM_PROTOCOL_VERSION; for (i = 0; i < node_record_count; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; if (agent_args->protocol_version > node_record_table_ptr[i].protocol_version) agent_args->protocol_version = node_record_table_ptr[i].protocol_version; hostlist_push_host(agent_args->hostlist, node_record_table_ptr[i].name); agent_args->node_count++; } #endif agent_args->msg_args = job_time_msg_ptr; agent_queue_request(agent_args); return; } /* * Return total amount of memory allocated to a job. This can be based upon * a GRES specification with various GRES/memory allocations on each node. * If current allocation information is not available, estimate memory based * upon pn_min_memory and either CPU or node count. */ extern uint64_t job_get_tres_mem(struct job_resources *job_res, uint64_t pn_min_memory, uint32_t cpu_cnt, uint32_t node_cnt) { uint64_t mem_total = 0; int i; if (job_res) { for (i = 0; i < job_res->nhosts; i++) { mem_total += job_res->memory_allocated[i]; } return mem_total; } if (pn_min_memory == NO_VAL64) return mem_total; if (pn_min_memory & MEM_PER_CPU) { if (cpu_cnt != NO_VAL) { mem_total = pn_min_memory & (~MEM_PER_CPU); mem_total *= cpu_cnt; } } else if (node_cnt != NO_VAL) mem_total = pn_min_memory * node_cnt; return mem_total; } /* * job_epilog_complete - Note the completion of the epilog script for a * given job * IN job_id - id of the job for which the epilog was executed * IN node_name - name of the node on which the epilog was executed * IN return_code - return code from epilog script * RET true if job is COMPLETED, otherwise false */ extern bool job_epilog_complete(uint32_t job_id, char *node_name, uint32_t return_code) { #ifdef HAVE_FRONT_END int i; #endif job_record_t *job_ptr = find_job_record(job_id); node_record_t *node_ptr; if (job_ptr == NULL) { debug("%s: unable to find JobId=%u for node=%s with return_code=%u.", __func__, job_id, node_name, return_code); return true; } trace_job(job_ptr, __func__, "enter"); /* * There is a potential race condition this handles. * If slurmctld cold-starts while slurmd keeps running, slurmd could * notify slurmctld of a job epilog completion before getting synced * up with slurmctld state. If a new job arrives and the job_id is * reused, we could try to note the termination of a job that hasn't * really started. Very rare obviously. */ if ((IS_JOB_PENDING(job_ptr) && (!IS_JOB_COMPLETING(job_ptr))) || (job_ptr->node_bitmap == NULL)) { #ifndef HAVE_FRONT_END uint32_t base_state = NODE_STATE_UNKNOWN; node_ptr = find_node_record(node_name); if (node_ptr) base_state = node_ptr->node_state & NODE_STATE_BASE; if (base_state == NODE_STATE_DOWN) { debug("%s: %pJ complete response from DOWN node %s", __func__, job_ptr, node_name); } else if (job_ptr->restart_cnt) { /* * Duplicate epilog complete can be due to race */ debug("%s: %pJ duplicate epilog complete response", __func__, job_ptr); } else { error("%s: %pJ is non-running slurmctld and slurmd out of sync", __func__, job_ptr); } #endif return false; } #ifdef HAVE_FRONT_END xassert(job_ptr->batch_host); /* * If there is a bad epilog error don't down the frontend node. * If needed the nodes in use by the job will be downed below. */ if (return_code) error("%s: %pJ epilog error on %s", __func__, job_ptr, job_ptr->batch_host); if (job_ptr->front_end_ptr && IS_JOB_COMPLETING(job_ptr)) { front_end_record_t *front_end_ptr = job_ptr->front_end_ptr; if (front_end_ptr->job_cnt_comp) front_end_ptr->job_cnt_comp--; else { error("%s: %pJ job_cnt_comp underflow on front end %s", __func__, job_ptr, front_end_ptr->name); } if (front_end_ptr->job_cnt_comp == 0) front_end_ptr->node_state &= (~NODE_STATE_COMPLETING); } if ((job_ptr->total_nodes == 0) && IS_JOB_COMPLETING(job_ptr)) { /* * Job resources moved into another job and * tasks already killed */ front_end_record_t *front_end_ptr = job_ptr->front_end_ptr; if (front_end_ptr) front_end_ptr->node_state &= (~NODE_STATE_COMPLETING); } else { for (i = 0; i < node_record_count; i++) { if (!bit_test(job_ptr->node_bitmap, i)) continue; node_ptr = &node_record_table_ptr[i]; if (return_code) { drain_nodes(node_ptr->name, "Epilog error", slurmctld_conf.slurm_user_id); } /* Change job from completing to completed */ make_node_idle(node_ptr, job_ptr); } } #else if (return_code) { error("%s: %pJ epilog error on %s, draining the node", __func__, job_ptr, node_name); drain_nodes(node_name, "Epilog error", slurmctld_conf.slurm_user_id); } /* Change job from completing to completed */ node_ptr = find_node_record(node_name); if (node_ptr) make_node_idle(node_ptr, job_ptr); #endif step_epilog_complete(job_ptr, node_name); /* nodes_completing is out of date, rebuild when next saved */ xfree(job_ptr->nodes_completing); if (!IS_JOB_COMPLETING(job_ptr)) { /* COMPLETED */ batch_requeue_fini(job_ptr); return true; } else return false; } /* Complete a batch job requeue logic after all steps complete so that * subsequent jobs appear in a separate accounting record. */ void batch_requeue_fini(job_record_t *job_ptr) { if (IS_JOB_COMPLETING(job_ptr) || !IS_JOB_PENDING(job_ptr) || !job_ptr->batch_flag) return; info("Requeuing %pJ", job_ptr); /* Clear everything so this appears to be a new job and then restart * it in accounting. */ job_ptr->start_time = 0; job_ptr->end_time_exp = job_ptr->end_time = 0; job_ptr->total_cpus = 0; job_ptr->pre_sus_time = 0; job_ptr->preempt_time = 0; job_ptr->suspend_time = 0; job_ptr->tot_sus_time = 0; /* Current code (<= 2.1) has it so we start the new job with the next * step id. This could be used when restarting to figure out which * step the previous run of this job stopped on. */ //job_ptr->next_step_id = 0; job_ptr->node_cnt = 0; xfree(job_ptr->nodes); xfree(job_ptr->nodes_completing); FREE_NULL_BITMAP(job_ptr->node_bitmap); FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); job_resv_clear_promiscous_flag(job_ptr); if (job_ptr->details) { time_t now = time(NULL); /* The time stamp on the new batch launch credential must be * larger than the time stamp on the revoke request. Also the * I/O must be all cleared out, the named socket purged and * the job credential purged by slurmd. */ if (job_ptr->details->begin_time <= now) { /* See src/common/slurm_cred.c * #define DEFAULT_EXPIRATION_WINDOW 1200 */ int cred_lifetime = 1200; (void) slurm_cred_ctx_get(slurmctld_config.cred_ctx, SLURM_CRED_OPT_EXPIRY_WINDOW, &cred_lifetime); job_ptr->details->begin_time = now + cred_lifetime + 1; } /* Since this could happen on a launch we need to make sure the * submit isn't the same as the last submit so put now + 1 so * we get different records in the database */ if (now == job_ptr->details->submit_time) now++; job_ptr->details->submit_time = now; /* clear the accrue flag */ job_ptr->bit_flags &= ~JOB_ACCRUE_OVER; job_ptr->details->accrue_time = 0; if ((job_ptr->details->whole_node == 1) && job_ptr->gres_list) { /* * We need to reset the gres_list to what was requested * instead of what was given exclusively. */ FREE_NULL_LIST(job_ptr->gres_list); (void)gres_plugin_job_state_validate( job_ptr->cpus_per_tres, job_ptr->tres_freq, job_ptr->tres_per_job, job_ptr->tres_per_node, job_ptr->tres_per_socket, job_ptr->tres_per_task, job_ptr->mem_per_tres, &job_ptr->details->num_tasks, &job_ptr->details->min_nodes, &job_ptr->details->max_nodes, &job_ptr->details->ntasks_per_node, &job_ptr->details->mc_ptr->ntasks_per_socket, &job_ptr->details->mc_ptr->sockets_per_node, &job_ptr->details->cpus_per_task, &job_ptr->gres_list); } } /* * If a reservation ended and was a repeated (e.g., daily, weekly) * reservation, its ID will be different; make sure * job->resv_id matches the reservation id. */ if (job_ptr->resv_ptr) job_ptr->resv_id = job_ptr->resv_ptr->resv_id; /* Reset this after the batch step has finished or the batch step * information will be attributed to the next run of the job. */ job_ptr->db_index = 0; if (!with_slurmdbd) jobacct_storage_g_job_start(acct_db_conn, job_ptr); /* Submit new sibling jobs for fed jobs */ if (fed_mgr_is_origin_job(job_ptr)) { if (fed_mgr_job_requeue(job_ptr)) { error("failed to submit requeued sibling jobs for fed %pJ", job_ptr); } } } /* job_fini - free all memory associated with job records */ void job_fini (void) { FREE_NULL_LIST(job_list); xfree(job_hash); xfree(job_array_hash_j); xfree(job_array_hash_t); FREE_NULL_LIST(purge_files_list); FREE_NULL_BITMAP(requeue_exit); FREE_NULL_BITMAP(requeue_exit_hold); } /* Record the start of one job array task */ extern void job_array_start(job_record_t *job_ptr) { job_record_t *base_job_ptr; if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { base_job_ptr = find_job_record(job_ptr->array_job_id); if (base_job_ptr && base_job_ptr->array_recs) { base_job_ptr->array_recs->tot_run_tasks++; } } } /* Return true if a job array task can be started */ extern bool job_array_start_test(job_record_t *job_ptr) { job_record_t *base_job_ptr; time_t now = time(NULL); if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { base_job_ptr = find_job_record(job_ptr->array_job_id); if (base_job_ptr && base_job_ptr->array_recs && (base_job_ptr->array_recs->max_run_tasks != 0) && (base_job_ptr->array_recs->tot_run_tasks >= base_job_ptr->array_recs->max_run_tasks)) { if (job_ptr->details && (job_ptr->details->begin_time <= now)) job_ptr->details->begin_time = (time_t) 0; xfree(job_ptr->state_desc); job_ptr->state_reason = WAIT_ARRAY_TASK_LIMIT; return false; } } return true; } static void _job_array_comp(job_record_t *job_ptr, bool was_running, bool requeue) { job_record_t *base_job_ptr; uint32_t status; if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) { status = job_ptr->exit_code; if ((status == 0) && !IS_JOB_COMPLETE(job_ptr)) { /* Avoid max_exit_code == 0 if task did not run to * successful completion (e.g. Cancelled, NodeFail) */ status = 9; } base_job_ptr = find_job_record(job_ptr->array_job_id); if (base_job_ptr && base_job_ptr->array_recs) { if (requeue) { base_job_ptr->array_recs->array_flags |= ARRAY_TASK_REQUEUED; } else if (!base_job_ptr->array_recs->tot_comp_tasks) { base_job_ptr->array_recs->min_exit_code = status; base_job_ptr->array_recs->max_exit_code = status; } else { base_job_ptr->array_recs->min_exit_code = MIN(status, base_job_ptr-> array_recs->min_exit_code); base_job_ptr->array_recs->max_exit_code = MAX(status, base_job_ptr-> array_recs->max_exit_code); } if (was_running && base_job_ptr->array_recs->tot_run_tasks) base_job_ptr->array_recs->tot_run_tasks--; base_job_ptr->array_recs->tot_comp_tasks++; } } } /* log the completion of the specified job */ extern void job_completion_logger(job_record_t *job_ptr, bool requeue) { int base_state; bool arr_finished = false, task_failed = false, task_requeued = false; bool was_running = false; job_record_t *master_job = NULL; uint32_t max_exit_code = 0; xassert(job_ptr); acct_policy_remove_job_submit(job_ptr); if (job_ptr->nodes && ((job_ptr->bit_flags & JOB_KILL_HURRY) == 0) && !IS_JOB_RESIZING(job_ptr)) { (void) bb_g_job_start_stage_out(job_ptr); } else if (job_ptr->nodes && IS_JOB_RESIZING(job_ptr)){ debug("%s: %pJ resizing, skipping bb stage_out", __func__, job_ptr); } else { /* * Never allocated compute nodes. * Unless job ran, there is no data to stage-out */ (void) bb_g_job_cancel(job_ptr); } if (job_ptr->bit_flags & JOB_WAS_RUNNING) { job_ptr->bit_flags &= ~JOB_WAS_RUNNING; was_running = true; } _job_array_comp(job_ptr, was_running, requeue); if (!IS_JOB_RESIZING(job_ptr) && !IS_JOB_PENDING(job_ptr) && ((job_ptr->array_task_id == NO_VAL) || (job_ptr->mail_type & MAIL_ARRAY_TASKS) || (arr_finished = test_job_array_finished(job_ptr->array_job_id)))) { /* Remove configuring state just to make sure it isn't there * since it will throw off displays of the job. */ job_ptr->job_state &= ~JOB_CONFIGURING; /* make sure all parts of the job are notified * Fed Jobs: only signal the srun from where the job is running * or from the origin if the job wasn't running. */ if (!job_ptr->fed_details || fed_mgr_job_is_self_owned(job_ptr) || (fed_mgr_is_origin_job(job_ptr) && !fed_mgr_job_is_locked(job_ptr))) srun_job_complete(job_ptr); /* mail out notifications of completion */ if (arr_finished) { /* We need to summarize different tasks states. */ master_job = find_job_record(job_ptr->array_job_id); if (master_job && master_job->array_recs) { task_requeued = (master_job->array_recs->array_flags & ARRAY_TASK_REQUEUED); if (task_requeued && (job_ptr->mail_type & MAIL_JOB_REQUEUE)) { /* * At least 1 task requeued and job * req. to be notified on requeues. */ mail_job_info(master_job, MAIL_JOB_REQUEUE); } max_exit_code = master_job->array_recs->max_exit_code; task_failed = (WIFEXITED(max_exit_code) && WEXITSTATUS(max_exit_code)); if (task_failed && (job_ptr->mail_type & MAIL_JOB_FAIL)) { /* * At least 1 task failed and job * req. to be notified on failures. */ mail_job_info(master_job, MAIL_JOB_FAIL); } else if (job_ptr->mail_type & MAIL_JOB_END) { /* * Job req. to be notified on END. */ mail_job_info(job_ptr, MAIL_JOB_END); } } } else { base_state = job_ptr->job_state & JOB_STATE_BASE; if ((base_state == JOB_COMPLETE) || (base_state == JOB_CANCELLED)) { if (requeue && (job_ptr->mail_type & MAIL_JOB_REQUEUE)) { mail_job_info(job_ptr, MAIL_JOB_REQUEUE); } else if (job_ptr->mail_type & MAIL_JOB_END) { mail_job_info(job_ptr, MAIL_JOB_END); } } else { /* JOB_FAILED, JOB_TIMEOUT, etc. */ if (job_ptr->mail_type & MAIL_JOB_FAIL) mail_job_info(job_ptr, MAIL_JOB_FAIL); else if (job_ptr->mail_type & MAIL_JOB_END) mail_job_info(job_ptr, MAIL_JOB_END); } } } g_slurm_jobcomp_write(job_ptr); /* When starting the resized job everything is taken care of * elsewhere, so don't call it here. */ if (IS_JOB_RESIZING(job_ptr)) return; if (!with_slurmdbd && !job_ptr->db_index) jobacct_storage_g_job_start(acct_db_conn, job_ptr); if (!(job_ptr->bit_flags & TRES_STR_CALC) && job_ptr->tres_alloc_cnt && (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64)) set_job_tres_alloc_str(job_ptr, false); jobacct_storage_g_job_complete(acct_db_conn, job_ptr); } /* * job_independent - determine if this job has a dependent job pending * or if the job's scheduled begin time is in the future * IN job_ptr - pointer to job being tested * RET - true if job no longer must be deferred for another job */ extern bool job_independent(job_record_t *job_ptr) { struct job_details *detail_ptr = job_ptr->details; time_t now = time(NULL); int depend_rc; if ((job_ptr->state_reason == FAIL_BURST_BUFFER_OP) || (job_ptr->state_reason == FAIL_ACCOUNT) || (job_ptr->state_reason == FAIL_QOS) || (job_ptr->state_reason == WAIT_HELD) || (job_ptr->state_reason == WAIT_HELD_USER) || (job_ptr->state_reason == WAIT_MAX_REQUEUE) || (job_ptr->state_reason == WAIT_RESV_DELETED) || (job_ptr->state_reason == WAIT_DEP_INVALID)) return false; /* Test dependencies first so we can cancel jobs before dependent * job records get purged (e.g. afterok, afternotok) */ depend_rc = test_job_dependency(job_ptr, NULL); if ((depend_rc == LOCAL_DEPEND) || (depend_rc == REMOTE_DEPEND)) { /* start_time has passed but still has dependency which * makes it ineligible */ if (detail_ptr->begin_time < now) detail_ptr->begin_time = 0; job_ptr->state_reason = WAIT_DEPENDENCY; xfree(job_ptr->state_desc); return false; } else if (depend_rc == FAIL_DEPEND) { handle_invalid_dependency(job_ptr); return false; } /* Job is eligible to start now */ if (job_ptr->state_reason == WAIT_DEPENDENCY) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); /* Submit the job to its siblings. */ if (job_ptr->details) { fed_mgr_job_requeue(job_ptr); } } /* Check for maximum number of running tasks in a job array */ if (!job_array_start_test(job_ptr)) return false; if (detail_ptr && (detail_ptr->begin_time > now)) { job_ptr->state_reason = WAIT_TIME; xfree(job_ptr->state_desc); return false; /* not yet time */ } if (job_test_resv_now(job_ptr) != SLURM_SUCCESS) { job_ptr->state_reason = WAIT_RESERVATION; xfree(job_ptr->state_desc); return false; /* not yet time */ } if ((detail_ptr && (detail_ptr->begin_time == 0) && (job_ptr->priority != 0))) { detail_ptr->begin_time = now; /* * Send begin time to the database if it is already there, or it * won't get there until the job starts. */ jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } else if (job_ptr->state_reason == WAIT_TIME) { job_ptr->state_reason = WAIT_NO_REASON; xfree(job_ptr->state_desc); } return true; } /* * determine if job is ready to execute per the node select plugin * IN job_id - job to test * OUT ready - 1 if job is ready to execute 0 otherwise * RET Slurm error code */ extern int job_node_ready(uint32_t job_id, int *ready) { int rc; job_record_t *job_ptr; xassert(ready); *ready = 0; job_ptr = find_job_record(job_id); if (job_ptr == NULL) return ESLURM_INVALID_JOB_ID; /* Always call select_g_job_ready() so that select/bluegene can * test and update block state information. */ rc = select_g_job_ready(job_ptr); if (rc == READY_JOB_FATAL) return ESLURM_INVALID_PARTITION_NAME; if (rc == READY_JOB_ERROR) return EAGAIN; if (rc) rc = READY_NODE_STATE; if (job_ptr->details && job_ptr->details->prolog_running) rc &= (~READY_NODE_STATE); if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) rc |= READY_JOB_STATE; if ((rc == (READY_NODE_STATE | READY_JOB_STATE)) && job_ptr->alias_list && !xstrcmp(job_ptr->alias_list, "TBD") && job_ptr->node_bitmap && (bit_overlap_any(power_node_bitmap, job_ptr->node_bitmap) == 0)) { last_job_update = time(NULL); set_job_alias_list(job_ptr); } *ready = rc; return SLURM_SUCCESS; } /* Send specified signal to all steps associated with a job */ static void _signal_job(job_record_t *job_ptr, int signal, uint16_t flags) { #ifndef HAVE_FRONT_END int i; #endif agent_arg_t *agent_args = NULL; signal_tasks_msg_t *signal_job_msg = NULL; static int notify_srun_static = -1; int notify_srun = 0; if (notify_srun_static == -1) { /* do this for all but slurm (poe, aprun, etc...) */ if (xstrcmp(slurmctld_conf.launch_type, "launch/slurm")) notify_srun_static = 1; else notify_srun_static = 0; } #ifdef HAVE_FRONT_END /* On a front end system always notify_srun instead of slurmd */ if (notify_srun_static) notify_srun = 1; #else /* For launch/poe all signals are forwarded by srun to poe to tasks * except SIGSTOP/SIGCONT, which are used for job preemption. In that * case the slurmd must directly suspend tasks and switch resources. */ if (notify_srun_static && (signal != SIGSTOP) && (signal != SIGCONT)) notify_srun = 1; #endif if (notify_srun) { ListIterator step_iterator; step_record_t *step_ptr; step_iterator = list_iterator_create(job_ptr->step_list); while ((step_ptr = list_next(step_iterator))) { /* Since we have already checked the uid, * we can send this signal as uid 0. */ job_step_signal(job_ptr->job_id, step_ptr->step_id, signal, 0, 0); } list_iterator_destroy (step_iterator); return; } agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SIGNAL_TASKS; agent_args->retry = 1; agent_args->hostlist = hostlist_create(NULL); signal_job_msg = xmalloc(sizeof(signal_tasks_msg_t)); signal_job_msg->job_id = job_ptr->job_id; /* * We don't ever want to kill a step with this message. The flags below * will make sure that does happen. Just in case though, set the * step_id to an impossible number. */ signal_job_msg->job_step_id = slurmctld_conf.max_step_cnt + 1; /* * Encode the flags for slurm stepd to know what steps get signaled * Here if we aren't signaling the full job we always only want to * signal all other steps. */ if ((flags & KILL_FULL_JOB) || (flags & KILL_JOB_BATCH) || (flags & KILL_STEPS_ONLY)) signal_job_msg->flags = flags; else signal_job_msg->flags = KILL_STEPS_ONLY; signal_job_msg->signal = signal; #ifdef HAVE_FRONT_END xassert(job_ptr->batch_host); if (job_ptr->front_end_ptr) agent_args->protocol_version = job_ptr->front_end_ptr->protocol_version; hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); agent_args->node_count = 1; #else agent_args->protocol_version = SLURM_PROTOCOL_VERSION; for (i = 0; i < node_record_count; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; if (agent_args->protocol_version > node_record_table_ptr[i].protocol_version) agent_args->protocol_version = node_record_table_ptr[i].protocol_version; hostlist_push_host(agent_args->hostlist, node_record_table_ptr[i].name); agent_args->node_count++; } #endif if (agent_args->node_count == 0) { xfree(signal_job_msg); xfree(agent_args); return; } agent_args->msg_args = signal_job_msg; agent_queue_request(agent_args); return; } static void *_switch_suspend_info(job_record_t *job_ptr) { ListIterator step_iterator; step_record_t *step_ptr; void *switch_suspend_info = NULL; step_iterator = list_iterator_create (job_ptr->step_list); while ((step_ptr = list_next(step_iterator))) { if (step_ptr->state != JOB_RUNNING) continue; switch_g_job_suspend_info_get(step_ptr->switch_job, &switch_suspend_info); } list_iterator_destroy (step_iterator); return switch_suspend_info; } /* Send suspend request to slumrd of all nodes associated with a job * job_ptr IN - job to be suspended or resumed * op IN - SUSPEND_JOB or RESUME_JOB * indf_susp IN - set if job is being suspended indefinitely by user * or admin, otherwise suspended for gang scheduling */ static void _suspend_job(job_record_t *job_ptr, uint16_t op, bool indf_susp) { #ifndef HAVE_FRONT_END int i; #endif agent_arg_t *agent_args; suspend_int_msg_t *sus_ptr; agent_args = xmalloc(sizeof(agent_arg_t)); agent_args->msg_type = REQUEST_SUSPEND_INT; agent_args->retry = 0; /* don't resend, gang scheduler can * quickly induce huge backlog * of agent.c RPCs */ agent_args->hostlist = hostlist_create(NULL); sus_ptr = xmalloc(sizeof(suspend_int_msg_t)); sus_ptr->job_core_spec = job_ptr->details->core_spec; sus_ptr->job_id = job_ptr->job_id; sus_ptr->op = op; sus_ptr->indf_susp = indf_susp; sus_ptr->switch_info = _switch_suspend_info(job_ptr); #ifdef HAVE_FRONT_END xassert(job_ptr->batch_host); if (job_ptr->front_end_ptr) { agent_args->protocol_version = job_ptr->front_end_ptr->protocol_version; } hostlist_push_host(agent_args->hostlist, job_ptr->batch_host); agent_args->node_count = 1; #else agent_args->protocol_version = SLURM_PROTOCOL_VERSION; for (i = 0; i < node_record_count; i++) { if (bit_test(job_ptr->node_bitmap, i) == 0) continue; if (agent_args->protocol_version > node_record_table_ptr[i].protocol_version) agent_args->protocol_version = node_record_table_ptr[i].protocol_version; hostlist_push_host(agent_args->hostlist, node_record_table_ptr[i].name); agent_args->node_count++; } #endif if (agent_args->node_count == 0) { slurm_free_suspend_int_msg(sus_ptr); xfree(agent_args); return; } agent_args->msg_args = sus_ptr; agent_queue_request(agent_args); return; } /* * Specified job is being suspended, release allocated nodes * job_ptr IN - job to be suspended * indf_susp IN - set if job is being suspended indefinitely by user * or admin, otherwise suspended for gang scheduling */ static int _suspend_job_nodes(job_record_t *job_ptr, bool indf_susp) { int i, i_first, i_last, rc = SLURM_SUCCESS; node_record_t *node_ptr; uint32_t node_flags; time_t now = time(NULL); if ((rc = select_g_job_suspend(job_ptr, indf_susp)) != SLURM_SUCCESS) return rc; i_first = bit_ffs(job_ptr->node_bitmap); if (i_first >= 0) i_last = bit_fls(job_ptr->node_bitmap); else i_last = -2; node_ptr = node_record_table_ptr + i_first; for (i = i_first; i <= i_last; i++, node_ptr++) { if (!bit_test(job_ptr->node_bitmap, i)) continue; node_ptr->sus_job_cnt++; if (node_ptr->run_job_cnt) (node_ptr->run_job_cnt)--; else { error("%s: %pJ node %s run_job_cnt underflow", __func__, job_ptr, node_ptr->name); } if (job_ptr->details && (job_ptr->details->share_res == 0)) { if (node_ptr->no_share_job_cnt) (node_ptr->no_share_job_cnt)--; else { error("%s: %pJ node %s no_share_job_cnt underflow", __func__, job_ptr, node_ptr->name); } if (node_ptr->no_share_job_cnt == 0) bit_set(share_node_bitmap, i); } node_flags = node_ptr->node_state & NODE_STATE_FLAGS; if ((node_ptr->run_job_cnt == 0) && (node_ptr->comp_job_cnt == 0)) { bit_set(idle_node_bitmap, i); } if (IS_NODE_DOWN(node_ptr)) { debug3("%s: %pJ node %s left DOWN", __func__, job_ptr, node_ptr->name); } else if (node_ptr->run_job_cnt) { node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; } else { node_ptr->node_state = NODE_STATE_IDLE | node_flags; node_ptr->last_idle = now; } } last_job_update = last_node_update = now; return rc; } /* * Specified job is being resumed, re-allocate the nodes * job_ptr IN - job to be resumed * indf_susp IN - set i f job is being resumed from indefinite suspend by user * or admin, otherwise resume from gang scheduling */ static int _resume_job_nodes(job_record_t *job_ptr, bool indf_susp) { int i, i_first, i_last, rc = SLURM_SUCCESS; node_record_t *node_ptr; uint32_t node_flags; if ((rc = select_g_job_resume(job_ptr, indf_susp)) != SLURM_SUCCESS) return rc; i_first = bit_ffs(job_ptr->node_bitmap); if (i_first >= 0) i_last = bit_fls(job_ptr->node_bitmap); else i_last = -2; node_ptr = node_record_table_ptr + i_first; for (i = i_first; i <= i_last; i++, node_ptr++) { if (!bit_test(job_ptr->node_bitmap, i)) continue; if (IS_NODE_DOWN(node_ptr)) return SLURM_ERROR; } node_ptr = node_record_table_ptr + i_first; for (i = i_first; i <= i_last; i++, node_ptr++) { if (!bit_test(job_ptr->node_bitmap, i)) continue; if (node_ptr->sus_job_cnt) (node_ptr->sus_job_cnt)--; else { error("Node %s sus_job_cnt underflow", node_ptr->name); } node_ptr->run_job_cnt++; if (job_ptr->details && (job_ptr->details->share_res == 0)) { node_ptr->no_share_job_cnt++; if (node_ptr->no_share_job_cnt) bit_clear(share_node_bitmap, i); } if (slurm_mcs_get_select(job_ptr) == 1) { xfree(node_ptr->mcs_label); node_ptr->mcs_label = xstrdup(job_ptr->mcs_label); } bit_clear(idle_node_bitmap, i); node_flags = node_ptr->node_state & NODE_STATE_FLAGS; node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags; } last_job_update = last_node_update = time(NULL); return rc; } static int _job_suspend_switch_test(job_record_t *job_ptr) { int rc = SLURM_SUCCESS; ListIterator step_iterator; step_record_t *step_ptr; step_iterator = list_iterator_create(job_ptr->step_list); while ((step_ptr = list_next(step_iterator))) { if (step_ptr->state != JOB_RUNNING) continue; rc = switch_g_job_suspend_test(step_ptr->switch_job); if (rc != SLURM_SUCCESS) break; } list_iterator_destroy (step_iterator); return rc; } /* * Determine if a job can be resumed. * Check for multiple jobs on the same nodes with core specialization. * RET 0 on success, otherwise ESLURM error code */ static int _job_resume_test(job_record_t *job_ptr) { int rc = SLURM_SUCCESS; ListIterator job_iterator; job_record_t *test_job_ptr; if ((job_ptr->details == NULL) || (job_ptr->details->core_spec == NO_VAL16) || (job_ptr->node_bitmap == NULL)) return rc; job_iterator = list_iterator_create(job_list); while ((test_job_ptr = list_next(job_iterator))) { if (test_job_ptr->details && (test_job_ptr->details->core_spec != NO_VAL16) && IS_JOB_RUNNING(test_job_ptr) && test_job_ptr->node_bitmap && bit_overlap_any(test_job_ptr->node_bitmap, job_ptr->node_bitmap)) { rc = ESLURM_NODES_BUSY; break; } /* FIXME: Also test for ESLURM_INTERCONNECT_BUSY */ } list_iterator_destroy(job_iterator); return rc; } /* * _job_suspend_op - perform some suspend/resume operation on a job * op IN - operation: suspend/resume * indf_susp IN - set if job is being suspended indefinitely by user or admin * and we should clear it's priority, otherwise suspended * temporarily for gang scheduling * RET 0 on success, otherwise ESLURM error code */ static int _job_suspend_op(job_record_t *job_ptr, uint16_t op, bool indf_susp) { int rc = SLURM_SUCCESS; time_t now = time(NULL); if (IS_JOB_PENDING(job_ptr)) return ESLURM_JOB_PENDING; if (IS_JOB_FINISHED(job_ptr)) return ESLURM_ALREADY_DONE; if ((op == SUSPEND_JOB) && (_job_suspend_switch_test(job_ptr) != SLURM_SUCCESS)) return ESLURM_NOT_SUPPORTED; if ((op == RESUME_JOB) && (rc = _job_resume_test(job_ptr))) return rc; /* perform the operation */ if (op == SUSPEND_JOB) { if (IS_JOB_SUSPENDED(job_ptr) && indf_susp) { debug("%s: Holding %pJ, re-suspend operation", __func__, job_ptr); job_ptr->priority = 0; /* Prevent gang sched resume */ return SLURM_SUCCESS; } if (!IS_JOB_RUNNING(job_ptr)) return ESLURM_JOB_NOT_RUNNING; rc = _suspend_job_nodes(job_ptr, indf_susp); if (rc != SLURM_SUCCESS) return rc; _suspend_job(job_ptr, op, indf_susp); job_ptr->job_state = JOB_SUSPENDED; if (indf_susp) { /* Job being manually suspended, not gang */ debug("%s: Holding %pJ, suspend operation", __func__, job_ptr); job_ptr->priority = 0; (void) gs_job_fini(job_ptr); } if (job_ptr->suspend_time) { job_ptr->pre_sus_time += difftime(now, job_ptr->suspend_time); } else { job_ptr->pre_sus_time += difftime(now, job_ptr->start_time); } suspend_job_step(job_ptr); } else if (op == RESUME_JOB) { if (!IS_JOB_SUSPENDED(job_ptr)) return ESLURM_JOB_NOT_SUSPENDED; rc = _resume_job_nodes(job_ptr, indf_susp); power_g_job_resume(job_ptr); if (rc != SLURM_SUCCESS) return rc; _suspend_job(job_ptr, op, indf_susp); if (job_ptr->priority == 0) { /* Job was manually suspended, not gang */ set_job_prio(job_ptr); (void) gs_job_start(job_ptr); } job_ptr->job_state = JOB_RUNNING; job_ptr->tot_sus_time += difftime(now, job_ptr->suspend_time); if ((job_ptr->time_limit != INFINITE) && (!job_ptr->preempt_time)) { debug3("%pJ resumed, updating end_time", job_ptr); job_ptr->end_time_exp = job_ptr->end_time = now + (job_ptr->time_limit * 60) - job_ptr->pre_sus_time; } resume_job_step(job_ptr); } job_ptr->time_last_active = now; job_ptr->suspend_time = now; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); return rc; } /* * _job_suspend - perform some suspend/resume operation, if the specified * job records is a hetjob leader, perform the operation on all * components of the hetjob * job_ptr - job to operate upon * op IN - operation: suspend/resume * indf_susp IN - set if job is being suspended indefinitely by user or admin * and we should clear it's priority, otherwise suspended * temporarily for gang scheduling * RET 0 on success, otherwise ESLURM error code */ static int _job_suspend(job_record_t *job_ptr, uint16_t op, bool indf_susp) { job_record_t *het_job; int rc = SLURM_SUCCESS, rc1; ListIterator iter; if (job_ptr->het_job_id && !job_ptr->het_job_list) return ESLURM_NOT_WHOLE_HET_JOB; /* Notify salloc/srun of suspend/resume */ srun_job_suspend(job_ptr, op); if (job_ptr->het_job_list) { iter = list_iterator_create(job_ptr->het_job_list); while ((het_job = list_next(iter))) { if (job_ptr->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, job_ptr); continue; } rc1 = _job_suspend_op(het_job, op, indf_susp); if (rc1 != SLURM_SUCCESS) rc = rc1; } list_iterator_destroy(iter); } else { rc = _job_suspend_op(job_ptr, op, indf_susp); } return rc; } /* * job_suspend - perform some suspend/resume operation * NOTE: job_suspend - Uses the job_id field and ignores job_id_str * * IN sus_ptr - suspend/resume request message * IN uid - user id of the user issuing the RPC * IN conn_fd - file descriptor on which to send reply, * -1 if none * indf_susp IN - set if job is being suspended indefinitely by user or admin * and we should clear it's priority, otherwise suspended * temporarily for gang scheduling * IN protocol_version - slurm protocol version of client * RET 0 on success, otherwise ESLURM error code */ extern int job_suspend(suspend_msg_t *sus_ptr, uid_t uid, int conn_fd, bool indf_susp, uint16_t protocol_version) { int rc = SLURM_SUCCESS; job_record_t *job_ptr = NULL; slurm_msg_t resp_msg; return_code_msg_t rc_msg; xfree(sus_ptr->job_id_str); xstrfmtcat(sus_ptr->job_id_str, "%u", sus_ptr->job_id); /* validate the request */ if (!validate_operator(uid)) { error("SECURITY VIOLATION: Attempt to suspend job from user %u", (int) uid); rc = ESLURM_ACCESS_DENIED; goto reply; } /* find the job */ job_ptr = find_job_record (sus_ptr->job_id); if (job_ptr == NULL) { rc = ESLURM_INVALID_JOB_ID; goto reply; } rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp); reply: /* Since we have already used it lets make sure we don't leak memory */ xfree(sus_ptr->job_id_str); if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); resp_msg.protocol_version = protocol_version; resp_msg.msg_type = RESPONSE_SLURM_RC; memset(&rc_msg, 0, sizeof(rc_msg)); rc_msg.return_code = rc; resp_msg.data = &rc_msg; slurm_send_node_msg(conn_fd, &resp_msg); } return rc; } /* * job_suspend2 - perform some suspend/resume operation * NB job_suspend2 - Ignores the job_id field and uses job_id_str * * IN sus_ptr - suspend/resume request message * IN uid - user id of the user issuing the RPC * IN conn_fd - file descriptor on which to send reply, * -1 if none * indf_susp IN - set if job is being suspended indefinitely by user or admin * and we should clear it's priority, otherwise suspended * temporarily for gang scheduling * IN protocol_version - slurm protocol version of client * RET 0 on success, otherwise ESLURM error code */ extern int job_suspend2(suspend_msg_t *sus_ptr, uid_t uid, int conn_fd, bool indf_susp, uint16_t protocol_version) { int rc = SLURM_SUCCESS, rc2; job_record_t *job_ptr = NULL; long int long_id; uint32_t job_id = 0; char *end_ptr = NULL, *tok, *tmp; bitstr_t *array_bitmap = NULL; bool valid = true; int32_t i, i_first, i_last; slurm_msg_t resp_msg; return_code_msg_t rc_msg; resp_array_struct_t *resp_array = NULL; job_array_resp_msg_t *resp_array_msg = NULL; if (max_array_size == NO_VAL) { max_array_size = slurmctld_conf.max_array_sz; } /* validate the request */ if (!validate_operator(uid)) { error("SECURITY VIOLATION: Attempt to suspend job from user %u", (int) uid); rc = ESLURM_ACCESS_DENIED; goto reply; } long_id = strtol(sus_ptr->job_id_str, &end_ptr, 10); if (end_ptr[0] == '+') rc = ESLURM_NOT_WHOLE_HET_JOB; else if ((long_id <= 0) || (long_id == LONG_MAX) || ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) rc = ESLURM_INVALID_JOB_ID; if (rc != SLURM_SUCCESS) { info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str); goto reply; } job_id = (uint32_t) long_id; if (end_ptr[0] == '\0') { /* Single job (or full job array) */ job_record_t *job_ptr_done = NULL; job_ptr = find_job_record(job_id); if (job_ptr && (((job_ptr->array_task_id == NO_VAL) && (job_ptr->array_recs == NULL)) || ((job_ptr->array_task_id != NO_VAL) && (job_ptr->array_job_id != job_id)))) { /* This is a regular job or single task of job array */ rc = _job_suspend(job_ptr, sus_ptr->op, indf_susp); goto reply; } if (job_ptr && job_ptr->array_recs) { /* This is a job array */ job_ptr_done = job_ptr; rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp); _resp_array_add(&resp_array, job_ptr, rc2); } /* Suspend all tasks of this job array */ job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; if (!job_ptr && !job_ptr_done) { rc = ESLURM_INVALID_JOB_ID; goto reply; } while (job_ptr) { if ((job_ptr->array_job_id == job_id) && (job_ptr != job_ptr_done)) { rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp); _resp_array_add(&resp_array, job_ptr, rc2); } job_ptr = job_ptr->job_array_next_j; } goto reply; } array_bitmap = bit_alloc(max_array_size); tmp = xstrdup(end_ptr + 1); tok = strtok_r(tmp, ",", &end_ptr); while (tok && valid) { valid = _parse_array_tok(tok, array_bitmap, max_array_size); tok = strtok_r(NULL, ",", &end_ptr); } xfree(tmp); if (valid) { i_last = bit_fls(array_bitmap); if (i_last < 0) valid = false; } if (!valid) { info("%s: invalid JobId=%s", __func__, sus_ptr->job_id_str); rc = ESLURM_INVALID_JOB_ID; goto reply; } i_first = bit_ffs(array_bitmap); if (i_first >= 0) i_last = bit_fls(array_bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(array_bitmap, i)) continue; job_ptr = find_job_array_rec(job_id, i); if (job_ptr == NULL) { info("%s: invalid JobId=%u_%d", __func__, job_id, i); _resp_array_add_id(&resp_array, job_id, i, ESLURM_INVALID_JOB_ID); continue; } rc2 = _job_suspend(job_ptr, sus_ptr->op, indf_susp); _resp_array_add(&resp_array, job_ptr, rc2); } reply: if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); resp_msg.protocol_version = protocol_version; if (resp_array) { resp_array_msg = _resp_array_xlate(resp_array, job_id); resp_msg.msg_type = RESPONSE_JOB_ARRAY_ERRORS; resp_msg.data = resp_array_msg; } else { resp_msg.msg_type = RESPONSE_SLURM_RC; rc_msg.return_code = rc; resp_msg.data = &rc_msg; } slurm_send_node_msg(conn_fd, &resp_msg); if (resp_array_msg) { slurm_free_job_array_resp(resp_array_msg); resp_msg.data = NULL; } } _resp_array_free(resp_array); FREE_NULL_BITMAP(array_bitmap); return rc; } /* * _job_requeue_op - Requeue a running or pending batch job * IN uid - user id of user issuing the RPC * IN job_ptr - job to be requeued * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ static int _job_requeue_op(uid_t uid, job_record_t *job_ptr, bool preempt, uint32_t flags) { bool is_running = false, is_suspended = false, is_completed = false; bool is_completing = false; time_t now = time(NULL); uint32_t completing_flags = 0; /* validate the request */ if ((uid != job_ptr->user_id) && !validate_operator(uid) && !assoc_mgr_is_user_acct_coord(acct_db_conn, uid, job_ptr->account)) { return ESLURM_ACCESS_DENIED; } if (((flags & JOB_STATE_BASE) == JOB_RUNNING) && !IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) { return SLURM_SUCCESS; } if (flags & JOB_RECONFIG_FAIL) node_features_g_get_node(job_ptr->nodes); /* * If the partition was removed don't allow the job to be * requeued. If it doesn't have details then something is very * wrong and if the job doesn't want to be requeued don't. */ if (!job_ptr->part_ptr || !job_ptr->details || !job_ptr->details->requeue) { if (flags & JOB_RECONFIG_FAIL) (void) _job_fail(job_ptr, JOB_BOOT_FAIL); return ESLURM_DISABLED; } if (job_ptr->batch_flag == 0) { debug("Job-requeue can only be done for batch jobs"); if (flags & JOB_RECONFIG_FAIL) (void) _job_fail(job_ptr, JOB_BOOT_FAIL); return ESLURM_BATCH_ONLY; } /* * If the job is already pending, just return an error. * A federated origin job can be pending and revoked with a sibling job * on another cluster. */ if (IS_JOB_PENDING(job_ptr) && (!job_ptr->fed_details || !job_ptr->fed_details->cluster_lock)) return ESLURM_JOB_PENDING; if ((flags & JOB_RECONFIG_FAIL) && IS_JOB_CANCELLED(job_ptr)) { /* * Job was cancelled (likely be the user) while node * reconfiguration was in progress, so don't requeue it * if the node reconfiguration failed. */ return ESLURM_DISABLED; } if (job_ptr->fed_details) { int rc; if ((rc = fed_mgr_job_requeue_test(job_ptr, flags))) return rc; /* Sent requeue request to origin cluster */ if (job_ptr->job_state & JOB_REQUEUE_FED) return SLURM_SUCCESS; } last_job_update = now; /* * In the job is in the process of completing * return SLURM_SUCCESS and set the status * to JOB_PENDING since we support requeue * of done/exit/exiting jobs. */ if (IS_JOB_COMPLETING(job_ptr)) { completing_flags = job_ptr->job_state & JOB_STATE_FLAGS; is_completing = true; } if (IS_JOB_SUSPENDED(job_ptr)) { uint32_t suspend_job_state = job_ptr->job_state; /* * we can't have it as suspended when we call the * accounting stuff. */ job_ptr->job_state = JOB_REQUEUE; jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); job_ptr->job_state = suspend_job_state; is_suspended = true; } job_ptr->time_last_active = now; if (is_suspended) job_ptr->end_time = job_ptr->suspend_time; else if (!is_completing) job_ptr->end_time = now; /* * Save the state of the job so that * we deallocate the nodes if is in * running state. */ if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) is_running = true; else if (IS_JOB_COMPLETED(job_ptr)) is_completed = true; /* Only change state to requeue for local jobs */ if (fed_mgr_is_origin_job(job_ptr) && !fed_mgr_is_tracker_only_job(job_ptr)) { /* * We want this job to have the requeued/preempted state in the * accounting logs. Set a new submit time so the restarted * job looks like a new job. */ if (preempt) { job_ptr->job_state = JOB_PREEMPTED; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, false); job_ptr->job_state = JOB_REQUEUE; } else { job_ptr->job_state = JOB_REQUEUE; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, true); } } /* * Increment restart counter before completing reply so that completing * jobs get counted and so that fed jobs get counted before submitting * new siblings in batch_requeue_fini() */ job_ptr->restart_cnt++; if (is_completing) { job_ptr->job_state = JOB_PENDING | completing_flags; goto reply; } /* * Deallocate resources only if the job has some. * JOB_COMPLETING is needed to properly clean up steps. */ if (is_running) { job_ptr->job_state |= JOB_COMPLETING; deallocate_nodes(job_ptr, false, is_suspended, preempt); job_ptr->job_state &= (~JOB_COMPLETING); } /* do this after the epilog complete, setting it here is too early */ //job_ptr->db_index = 0; //job_ptr->details->submit_time = now; job_ptr->job_state = JOB_PENDING; if (job_ptr->node_cnt) job_ptr->job_state |= JOB_COMPLETING; /* * Mark the origin job as requeueing. Will finish requeueing fed job * after job has completed. * If it's completed, batch_requeue_fini is called below and will call * fed_mgr_job_requeue() to submit new siblings. * If it's not completed, batch_requeue_fini will either be called when * the running origin job finishes or the running remote sibling job * reports that the job is finished. */ if (job_ptr->fed_details && !is_completed) { job_ptr->job_state |= JOB_COMPLETING; job_ptr->job_state |= JOB_REQUEUE_FED; } /* * If we set the time limit it means the user didn't so reset * it here or we could bust some limit when we try again */ if (job_ptr->limit_set.time == 1) { job_ptr->time_limit = NO_VAL; job_ptr->limit_set.time = 0; } reply: job_ptr->pre_sus_time = (time_t) 0; job_ptr->suspend_time = (time_t) 0; job_ptr->tot_sus_time = (time_t) 0; job_ptr->db_flags = 0; /* clear signal sent flag on requeue */ job_ptr->warn_flags &= ~WARN_SENT; /* * Since the job completion logger removes the submit we need * to add it again. */ acct_policy_add_job_submit(job_ptr); acct_policy_update_pending_job(job_ptr); if (flags & JOB_SPECIAL_EXIT) { job_ptr->job_state |= JOB_SPECIAL_EXIT; job_ptr->state_reason = WAIT_HELD_USER; xfree(job_ptr->state_desc); job_ptr->state_desc = xstrdup("job requeued in special exit state"); debug("%s: Holding %pJ, special exit", __func__, job_ptr); job_ptr->priority = 0; } if (flags & JOB_REQUEUE_HOLD) { job_ptr->state_reason = WAIT_HELD_USER; xfree(job_ptr->state_desc); if (flags & JOB_LAUNCH_FAILED) { job_ptr->state_desc = xstrdup("launch failed requeued held"); } else { job_ptr->state_desc = xstrdup("job requeued in held state"); } debug("%s: Holding %pJ, requeue-hold exit", __func__, job_ptr); job_ptr->priority = 0; } /* * When jobs are requeued while running/completing batch_requeue_fini is * called after the job is completely finished. If the job is already * finished it needs to be called to clear out states (especially the * db_index or we will just write over the last job in the database). * Call batch_requeue_fini after setting priority to 0 for requeue_hold * and special_exit so federation doesn't submit siblings for held job. */ if (is_completed) batch_requeue_fini(job_ptr); debug("%s: %pJ state 0x%x reason %u priority %d", __func__, job_ptr, job_ptr->job_state, job_ptr->state_reason, job_ptr->priority); return SLURM_SUCCESS; } /* * _job_requeue - Requeue a running or pending batch job, if the specified * job records is a hetjob leader, perform the operation on all * components of the hetjob * IN uid - user id of user issuing the RPC * IN job_ptr - job to be requeued * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ static int _job_requeue(uid_t uid, job_record_t *job_ptr, bool preempt, uint32_t flags) { job_record_t *het_job; int rc = SLURM_SUCCESS, rc1; ListIterator iter; if (job_ptr->het_job_id && !job_ptr->het_job_list) return ESLURM_NOT_HET_JOB_LEADER; if (job_ptr->het_job_list) { iter = list_iterator_create(job_ptr->het_job_list); while ((het_job = list_next(iter))) { if (job_ptr->het_job_id != het_job->het_job_id) { error("%s: Bad het_job_list for %pJ", __func__, job_ptr); continue; } rc1 = _job_requeue_op(uid, het_job, preempt, flags); if (rc1 != SLURM_SUCCESS) rc = rc1; } list_iterator_destroy(iter); } else { rc = _job_requeue_op(uid, job_ptr, preempt, flags); } return rc; } /* * job_requeue - Requeue a running or pending batch job * IN uid - user id of user issuing the RPC * IN job_id - id of the job to be requeued * IN msg - slurm_msg to send response back on * IN preempt - true if job being preempted * IN flags - JobExitRequeue | Hold | JobFailed | etc. * RET 0 on success, otherwise ESLURM error code */ extern int job_requeue(uid_t uid, uint32_t job_id, slurm_msg_t *msg, bool preempt, uint32_t flags) { int rc = SLURM_SUCCESS; job_record_t *job_ptr = NULL; /* find the job */ job_ptr = find_job_record(job_id); if (job_ptr == NULL) { rc = ESLURM_INVALID_JOB_ID; } else { /* _job_requeue already handles het jobs */ rc = _job_requeue(uid, job_ptr, preempt, flags); } if (msg) { slurm_send_rc_msg(msg, rc); } return rc; } /* * job_requeue2 - Requeue a running or pending batch job * IN uid - user id of user issuing the RPC * IN req_ptr - request including ID of the job to be requeued * IN msg - slurm_msg to send response back on * IN preempt - true if job being preempted * RET 0 on success, otherwise ESLURM error code */ extern int job_requeue2(uid_t uid, requeue_msg_t *req_ptr, slurm_msg_t *msg, bool preempt) { int rc = SLURM_SUCCESS, rc2; job_record_t *job_ptr = NULL; long int long_id; uint32_t job_id = 0; char *end_ptr = NULL, *tok, *tmp; bitstr_t *array_bitmap = NULL; bool valid = true; int32_t i, i_first, i_last; slurm_msg_t resp_msg; return_code_msg_t rc_msg; uint32_t flags = req_ptr->flags; char *job_id_str = req_ptr->job_id_str; resp_array_struct_t *resp_array = NULL; job_array_resp_msg_t *resp_array_msg = NULL; if (max_array_size == NO_VAL) { max_array_size = slurmctld_conf.max_array_sz; } long_id = strtol(job_id_str, &end_ptr, 10); if ((long_id <= 0) || (long_id == LONG_MAX) || ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) { info("%s: invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto reply; } if ((end_ptr[0] == '_') && (end_ptr[1] == '*')) end_ptr += 2; /* Defaults to full job array */ job_id = (uint32_t) long_id; if (end_ptr[0] == '\0') { /* Single job (or full job array) */ job_record_t *job_ptr_done = NULL; job_ptr = find_job_record(job_id); if (job_ptr && (((job_ptr->array_task_id == NO_VAL) && (job_ptr->array_recs == NULL)) || ((job_ptr->array_task_id != NO_VAL) && (job_ptr->array_job_id != job_id)))) { /* This is a regular job or single task of job array */ rc = _job_requeue(uid, job_ptr, preempt, flags); goto reply; } if (job_ptr && job_ptr->array_recs) { /* This is a job array */ job_ptr_done = job_ptr; rc2 = _job_requeue(uid, job_ptr, preempt, flags); _resp_array_add(&resp_array, job_ptr, rc2); } /* Requeue all tasks of this job array */ job_ptr = job_array_hash_j[JOB_HASH_INX(job_id)]; if (!job_ptr && !job_ptr_done) { rc = ESLURM_INVALID_JOB_ID; goto reply; } while (job_ptr) { if ((job_ptr->array_job_id == job_id) && (job_ptr != job_ptr_done)) { rc2 = _job_requeue(uid, job_ptr, preempt,flags); _resp_array_add(&resp_array, job_ptr, rc2); } job_ptr = job_ptr->job_array_next_j; } goto reply; } array_bitmap = bit_alloc(max_array_size); tmp = xstrdup(end_ptr + 1); tok = strtok_r(tmp, ",", &end_ptr); while (tok && valid) { valid = _parse_array_tok(tok, array_bitmap, max_array_size); tok = strtok_r(NULL, ",", &end_ptr); } xfree(tmp); if (valid) { i_last = bit_fls(array_bitmap); if (i_last < 0) valid = false; } if (!valid) { info("%s: invalid JobId=%s", __func__, job_id_str); rc = ESLURM_INVALID_JOB_ID; goto reply; } i_first = bit_ffs(array_bitmap); if (i_first >= 0) i_last = bit_fls(array_bitmap); else i_last = -2; for (i = i_first; i <= i_last; i++) { if (!bit_test(array_bitmap, i)) continue; job_ptr = find_job_array_rec(job_id, i); if (job_ptr == NULL) { info("%s: invalid JobId=%u_%d", __func__, job_id, i); _resp_array_add_id(&resp_array, job_id, i, ESLURM_INVALID_JOB_ID); continue; } rc2 = _job_requeue(uid, job_ptr, preempt, flags); _resp_array_add(&resp_array, job_ptr, rc2); } reply: if (msg) { response_init(&resp_msg, msg); if (resp_array) { resp_array_msg = _resp_array_xlate(resp_array, job_id); resp_msg.msg_type = RESPONSE_JOB_ARRAY_ERRORS; resp_msg.data = resp_array_msg; } else { resp_msg.msg_type = RESPONSE_SLURM_RC; rc_msg.return_code = rc; resp_msg.data = &rc_msg; } slurm_send_node_msg(msg->conn_fd, &resp_msg); if (resp_array_msg) { slurm_free_job_array_resp(resp_array_msg); resp_msg.data = NULL; } } _resp_array_free(resp_array); FREE_NULL_BITMAP(array_bitmap); return rc; } static int _top_job_flag_clear(void *x, void *arg) { job_record_t *job_ptr = (job_record_t *) x; job_ptr->bit_flags &= (~TOP_PRIO_TMP); return 0; } /* This sorts so the highest priorities come off the list first */ static int _top_job_prio_sort(void *x, void *y) { uint32_t *prio1, *prio2; prio1 = *(uint32_t **) x; prio2 = *(uint32_t **) y; if (*prio1 < *prio2) return 1; if (*prio1 > *prio2) return -1; return 0; } static int _set_top(List top_job_list, uid_t uid) { List prio_list, other_job_list; ListIterator iter; job_record_t *job_ptr, *first_job_ptr = NULL; int rc = SLURM_SUCCESS, rc2 = SLURM_SUCCESS; uint32_t last_prio = NO_VAL, next_prio; int64_t delta_prio, delta_nice, total_delta = 0; int other_job_cnt = 0; uint32_t *prio_elem; xassert(job_list); xassert(top_job_list); prio_list = list_create(xfree_ptr); (void) list_for_each(job_list, _top_job_flag_clear, NULL); /* Validate the jobs in our "top" list */ iter = list_iterator_create(top_job_list); while ((job_ptr = list_next(iter))) { if ((job_ptr->user_id != uid) && (uid != 0)) { error("Security violation: REQUEST_TOP_JOB for %pJ from uid=%u", job_ptr, uid); rc = ESLURM_ACCESS_DENIED; break; } if (!IS_JOB_PENDING(job_ptr) || (job_ptr->details == NULL)) { debug("%s: %pJ not pending", __func__, job_ptr); list_remove(iter); rc2 = ESLURM_JOB_NOT_PENDING; continue; } if (job_ptr->part_ptr_list) { debug("%s: %pJ in partition list", __func__, job_ptr); list_remove(iter); rc = ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE; break; } if (job_ptr->priority == 0) { debug("%s: %pJ is held", __func__, job_ptr); list_remove(iter); rc2 = ESLURM_JOB_HELD; continue; } if (job_ptr->bit_flags & TOP_PRIO_TMP) { /* Duplicate job ID */ list_remove(iter); continue; } if (!first_job_ptr) first_job_ptr = job_ptr; job_ptr->bit_flags |= TOP_PRIO_TMP; prio_elem = xmalloc(sizeof(uint32_t)); *prio_elem = job_ptr->priority; list_append(prio_list, prio_elem); } list_iterator_destroy(iter); if (rc != SLURM_SUCCESS) { FREE_NULL_LIST(prio_list); return rc; } if (!first_job_ptr) { FREE_NULL_LIST(prio_list); return rc2; } /* Identify other jobs which we can adjust the nice value of */ other_job_list = list_create(NULL); iter = list_iterator_create(job_list); while ((job_ptr = list_next(iter))) { /* * Do not select jobs with priority 0 (held), or * priority 1 (would be held if we lowered the priority). */ if ((job_ptr->bit_flags & TOP_PRIO_TMP) || (job_ptr->details == NULL) || (job_ptr->part_ptr_list) || (job_ptr->priority <= 1) || (job_ptr->assoc_ptr != first_job_ptr->assoc_ptr) || (job_ptr->part_ptr != first_job_ptr->part_ptr) || (job_ptr->qos_ptr != first_job_ptr->qos_ptr) || (job_ptr->user_id != first_job_ptr->user_id) || (!IS_JOB_PENDING(job_ptr))) continue; other_job_cnt++; job_ptr->bit_flags |= TOP_PRIO_TMP; prio_elem = xmalloc(sizeof(uint32_t)); *prio_elem = job_ptr->priority; list_append(prio_list, prio_elem); list_append(other_job_list, job_ptr); } list_iterator_destroy(iter); /* Now adjust nice values and priorities of the listed "top" jobs */ list_sort(prio_list, _top_job_prio_sort); iter = list_iterator_create(top_job_list); while ((job_ptr = list_next(iter))) { prio_elem = list_pop(prio_list); next_prio = *prio_elem; xfree(prio_elem); if ((last_prio != NO_VAL) && (next_prio == last_prio) && (last_prio > 2)) /* * We don't want to set job priority lower than 1, so * last_prio cannot be smaller than 2, since we will * later use last_prio - 1 for the new job priority. */ next_prio = last_prio - 1; last_prio = next_prio; delta_prio = (int64_t) next_prio - job_ptr->priority; delta_nice = MIN(job_ptr->details->nice, delta_prio); total_delta += delta_nice; job_ptr->priority = next_prio; job_ptr->details->nice -= delta_nice; job_ptr->bit_flags &= (~TOP_PRIO_TMP); } list_iterator_destroy(iter); FREE_NULL_LIST(prio_list); /* Now adjust nice values and priorities of remaining effected jobs */ if (other_job_cnt) { iter = list_iterator_create(other_job_list); while ((job_ptr = list_next(iter))) { delta_prio = total_delta / other_job_cnt; next_prio = job_ptr->priority - delta_prio; if (next_prio >= last_prio) { next_prio = last_prio - 1; delta_prio = job_ptr->priority - next_prio; } delta_nice = delta_prio; job_ptr->priority = next_prio; job_ptr->details->nice += delta_nice; job_ptr->bit_flags &= (~TOP_PRIO_TMP); total_delta -= delta_nice; if (--other_job_cnt == 0) break; /* Count will match list size anyway */ } list_iterator_destroy(iter); } FREE_NULL_LIST(other_job_list); last_job_update = time(NULL); return rc; } /* * job_set_top - Move the specified jobs to the top of the queue (at least * for that user ID, partition, account, and QOS). * * IN top_ptr - user request * IN uid - user id of the user issuing the RPC * IN conn_fd - file descriptor on which to send reply, * -1 if none * IN protocol_version - slurm protocol version of client * RET 0 on success, otherwise ESLURM error code */ extern int job_set_top(top_job_msg_t *top_ptr, uid_t uid, int conn_fd, uint16_t protocol_version) { int rc = SLURM_SUCCESS; List top_job_list = NULL; char *job_str_tmp = NULL, *tok, *save_ptr = NULL, *end_ptr = NULL; job_record_t *job_ptr = NULL; long int long_id; uint32_t job_id = 0, task_id = 0; slurm_msg_t resp_msg; return_code_msg_t rc_msg; if (validate_operator(uid)) { uid = 0; } else { bool disable_user_top = true; char *sched_params = slurm_get_sched_params(); if (xstrcasestr(sched_params, "enable_user_top")) disable_user_top = false; xfree(sched_params); if (disable_user_top) { rc = ESLURM_ACCESS_DENIED; goto reply; } } top_job_list = list_create(NULL); job_str_tmp = xstrdup(top_ptr->job_id_str); tok = strtok_r(job_str_tmp, ",", &save_ptr); while (tok) { long_id = strtol(tok, &end_ptr, 10); if ((long_id <= 0) || (long_id == LONG_MAX) || ((end_ptr[0] != '\0') && (end_ptr[0] != '_'))) { info("%s: invalid job id %s", __func__, tok); rc = ESLURM_INVALID_JOB_ID; goto reply; } job_id = (uint32_t) long_id; if ((end_ptr[0] == '\0') || /* Single job (or full job array) */ ((end_ptr[0] == '_') && (end_ptr[1] == '*') && (end_ptr[2] == '\0'))) { job_ptr = find_job_record(job_id); if (!job_ptr) { rc = ESLURM_INVALID_JOB_ID; goto reply; } list_append(top_job_list, job_ptr); } else if (end_ptr[0] != '_') { /* Invalid job ID spec */ rc = ESLURM_INVALID_JOB_ID; goto reply; } else { /* Single task of a job array */ task_id = strtol(end_ptr + 1, &end_ptr, 10); if (end_ptr[0] != '\0') { /* Invalid job ID spec */ rc = ESLURM_INVALID_JOB_ID; goto reply; } job_ptr = find_job_array_rec(job_id, task_id); if (!job_ptr) { rc = ESLURM_INVALID_JOB_ID; goto reply; } list_append(top_job_list, job_ptr); } tok = strtok_r(NULL, ",", &save_ptr); } if (list_count(top_job_list) == 0) { rc = ESLURM_INVALID_JOB_ID; goto reply; } rc = _set_top(top_job_list, uid); reply: FREE_NULL_LIST(top_job_list); xfree(job_str_tmp); if (conn_fd >= 0) { slurm_msg_t_init(&resp_msg); resp_msg.protocol_version = protocol_version; resp_msg.msg_type = RESPONSE_SLURM_RC; memset(&rc_msg, 0, sizeof(rc_msg)); rc_msg.return_code = rc; resp_msg.data = &rc_msg; slurm_send_node_msg(conn_fd, &resp_msg); } return rc; } /* * job_end_time - Process JOB_END_TIME * IN time_req_msg - job end time request * OUT timeout_msg - job timeout response to be sent * RET SLURM_SUCCESS or an error code */ extern int job_end_time(job_alloc_info_msg_t *time_req_msg, srun_timeout_msg_t *timeout_msg) { job_record_t *job_ptr; xassert(timeout_msg); job_ptr = find_job_record(time_req_msg->job_id); if (!job_ptr) return ESLURM_INVALID_JOB_ID; memset(timeout_msg, 0, sizeof(srun_timeout_msg_t)); timeout_msg->job_id = time_req_msg->job_id; timeout_msg->step_id = NO_VAL; timeout_msg->timeout = job_ptr->end_time; return SLURM_SUCCESS; } /* Reset nodes_completing field for all jobs. */ extern void update_job_nodes_completing(void) { ListIterator job_iterator; job_record_t *job_ptr; xassert(verify_lock(JOB_LOCK, WRITE_LOCK)); if (!job_list) return; job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if ((!IS_JOB_COMPLETING(job_ptr)) || (job_ptr->node_bitmap == NULL)) continue; xfree(job_ptr->nodes_completing); if (job_ptr->node_bitmap_cg) { job_ptr->nodes_completing = bitmap2node_name(job_ptr->node_bitmap_cg); } else { job_ptr->nodes_completing = bitmap2node_name(job_ptr->node_bitmap); } } list_iterator_destroy(job_iterator); } /* * job_hold_by_assoc_id - Hold all pending jobs with a given * association ID. This happens when an association is deleted (e.g. when * a user is removed from the association database). * RET count of held jobs */ extern int job_hold_by_assoc_id(uint32_t assoc_id) { int cnt = 0; ListIterator job_iterator; job_record_t *job_ptr; /* Write lock on jobs */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; if (!job_list) return cnt; lock_slurmctld(job_write_lock); job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if (job_ptr->assoc_id != assoc_id) continue; cnt += _job_fail_account(job_ptr, __func__); } list_iterator_destroy(job_iterator); unlock_slurmctld(job_write_lock); return cnt; } /* * job_hold_by_qos_id - Hold all pending jobs with a given * QOS ID. This happens when a QOS is deleted (e.g. when * a QOS is removed from the association database). * RET count of held jobs */ extern int job_hold_by_qos_id(uint32_t qos_id) { int cnt = 0; ListIterator job_iterator; job_record_t *job_ptr; /* Write lock on jobs */ slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, NO_LOCK, NO_LOCK, NO_LOCK }; if (!job_list) return cnt; lock_slurmctld(job_write_lock); job_iterator = list_iterator_create(job_list); while ((job_ptr = list_next(job_iterator))) { if (job_ptr->qos_blocking_ptr && ((slurmdb_qos_rec_t *)job_ptr->qos_blocking_ptr)->id == qos_id) job_ptr->qos_blocking_ptr = NULL; if (job_ptr->qos_id != qos_id) continue; cnt += job_fail_qos(job_ptr, __func__); } list_iterator_destroy(job_iterator); unlock_slurmctld(job_write_lock); return cnt; } /* * Modify the account associated with a pending job * IN module - where this is called from * IN job_ptr - pointer to job which should be modified * IN new_wckey - desired wckey name * RET SLURM_SUCCESS or error code */ extern int update_job_wckey(char *module, job_record_t *job_ptr, char *new_wckey) { slurmdb_wckey_rec_t wckey_rec, *wckey_ptr; if ((!IS_JOB_PENDING(job_ptr)) || (job_ptr->details == NULL)) { info("%s: attempt to modify account for non-pending %pJ", module, job_ptr); return ESLURM_JOB_NOT_PENDING; } memset(&wckey_rec, 0, sizeof(wckey_rec)); wckey_rec.uid = job_ptr->user_id; wckey_rec.name = new_wckey; if (assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, accounting_enforce, &wckey_ptr, false)) { info("%s: invalid wckey %s for %pJ", module, new_wckey, job_ptr); return ESLURM_INVALID_WCKEY; } else if (association_based_accounting && !wckey_ptr && !(accounting_enforce & ACCOUNTING_ENFORCE_WCKEYS)) { /* if not enforcing associations we want to look for the default account and use it to avoid getting trash in the accounting records. */ wckey_rec.name = NULL; assoc_mgr_fill_in_wckey(acct_db_conn, &wckey_rec, accounting_enforce, &wckey_ptr, false); if (!wckey_ptr) { debug("%s: we didn't have a wckey record for wckey " "'%s' and user '%u', and we can't seem to find " "a default one either. Setting it anyway. " "This will produce trash in accounting. " "If this is not what you desire please put " "AccountStorageEnforce=wckeys in your slurm.conf " "file.", module, new_wckey, job_ptr->user_id); wckey_rec.name = new_wckey; } } xfree(job_ptr->wckey); if (wckey_rec.name && wckey_rec.name[0] != '\0') { job_ptr->wckey = xstrdup(wckey_rec.name); info("%s: setting wckey to %s for %pJ", module, wckey_rec.name, job_ptr); } else { info("%s: cleared wckey for %pJ", module, job_ptr); } last_job_update = time(NULL); return SLURM_SUCCESS; } extern int send_jobs_to_accounting(void) { ListIterator itr = NULL; job_record_t *job_ptr; slurmctld_lock_t job_write_lock = { NO_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK, NO_LOCK }; /* send jobs in pending or running state */ lock_slurmctld(job_write_lock); itr = list_iterator_create(job_list); while ((job_ptr = list_next(itr))) { if (!job_ptr->assoc_id) { slurmdb_assoc_rec_t assoc_rec; memset(&assoc_rec, 0, sizeof(assoc_rec)); assoc_rec.acct = job_ptr->account; if (job_ptr->part_ptr) assoc_rec.partition = job_ptr->part_ptr->name; assoc_rec.uid = job_ptr->user_id; if (assoc_mgr_fill_in_assoc( acct_db_conn, &assoc_rec, accounting_enforce, &job_ptr->assoc_ptr, false) && (accounting_enforce & ACCOUNTING_ENFORCE_ASSOCS) && (!IS_JOB_FINISHED(job_ptr))) { _job_fail_account(job_ptr, __func__); continue; } else job_ptr->assoc_id = assoc_rec.id; } /* we only want active, un accounted for jobs */ if (job_ptr->db_index || IS_JOB_FINISHED(job_ptr)) continue; debug("first reg: starting %pJ in accounting", job_ptr); jobacct_storage_g_job_start(acct_db_conn, job_ptr); if (IS_JOB_SUSPENDED(job_ptr)) jobacct_storage_g_job_suspend(acct_db_conn, job_ptr); } list_iterator_destroy(itr); unlock_slurmctld(job_write_lock); return SLURM_SUCCESS; } /* * copy_job_record_to_job_desc - construct a job_desc_msg_t for a job. * IN job_ptr - the job record * RET the job_desc_msg_t, NULL on error */ extern job_desc_msg_t *copy_job_record_to_job_desc(job_record_t *job_ptr) { job_desc_msg_t *job_desc; struct job_details *details = job_ptr->details; multi_core_data_t *mc_ptr = details->mc_ptr; int i; /* construct a job_desc_msg_t from job */ job_desc = xmalloc(sizeof(job_desc_msg_t)); job_desc->account = xstrdup(job_ptr->account); job_desc->acctg_freq = xstrdup(details->acctg_freq); job_desc->alloc_node = xstrdup(job_ptr->alloc_node); /* Since the allocating salloc or srun is not expected to exist * when this checkpointed job is restarted, do not save these: * * job_desc->alloc_resp_port = job_ptr->alloc_resp_port; * job_desc->alloc_sid = job_ptr->alloc_sid; */ job_desc->argc = details->argc; job_desc->argv = xcalloc(job_desc->argc, sizeof(char *)); for (i = 0; i < job_desc->argc; i ++) job_desc->argv[i] = xstrdup(details->argv[i]); job_desc->begin_time = details->begin_time; job_desc->bitflags = job_ptr->bit_flags; job_desc->clusters = xstrdup(job_ptr->clusters); job_desc->comment = xstrdup(job_ptr->comment); job_desc->contiguous = details->contiguous; job_desc->core_spec = details->core_spec; job_desc->cpu_bind = xstrdup(details->cpu_bind); job_desc->cpu_bind_type = details->cpu_bind_type; job_desc->cpu_freq_min = details->cpu_freq_min; job_desc->cpu_freq_max = details->cpu_freq_max; job_desc->cpu_freq_gov = details->cpu_freq_gov; job_desc->deadline = job_ptr->deadline; job_desc->dependency = xstrdup(details->dependency); job_desc->end_time = 0; /* Unused today */ job_desc->environment = get_job_env(job_ptr, &job_desc->env_size); job_desc->exc_nodes = xstrdup(details->exc_nodes); job_desc->features = xstrdup(details->features); job_desc->cluster_features = xstrdup(details->cluster_features); job_desc->group_id = job_ptr->group_id; job_desc->immediate = 0; /* nowhere to get this value */ job_desc->job_id = job_ptr->job_id; job_desc->kill_on_node_fail = job_ptr->kill_on_node_fail; job_desc->licenses = xstrdup(job_ptr->licenses); job_desc->mail_type = job_ptr->mail_type; job_desc->mail_user = xstrdup(job_ptr->mail_user); job_desc->mcs_label = xstrdup(job_ptr->mcs_label); job_desc->mem_bind = xstrdup(details->mem_bind); job_desc->mem_bind_type = details->mem_bind_type; job_desc->name = xstrdup(job_ptr->name); job_desc->network = xstrdup(job_ptr->network); job_desc->nice = details->nice; job_desc->num_tasks = details->num_tasks; job_desc->open_mode = details->open_mode; job_desc->origin_cluster = xstrdup(job_ptr->origin_cluster); job_desc->other_port = job_ptr->other_port; job_desc->power_flags = job_ptr->power_flags; job_desc->overcommit = details->overcommit; job_desc->partition = xstrdup(job_ptr->partition); job_desc->plane_size = details->plane_size; job_desc->priority = job_ptr->priority; if (job_ptr->qos_ptr) job_desc->qos = xstrdup(job_ptr->qos_ptr->name); job_desc->resp_host = xstrdup(job_ptr->resp_host); job_desc->req_nodes = xstrdup(details->req_nodes); job_desc->requeue = details->requeue; job_desc->reservation = xstrdup(job_ptr->resv_name); job_desc->restart_cnt = job_ptr->restart_cnt; job_desc->script_buf = get_job_script(job_ptr); if (details->share_res == 1) job_desc->shared = JOB_SHARED_OK; else if (details->whole_node == WHOLE_NODE_REQUIRED) job_desc->shared = JOB_SHARED_NONE; else if (details->whole_node == WHOLE_NODE_USER) job_desc->shared = JOB_SHARED_USER; else if (details->whole_node == WHOLE_NODE_MCS) job_desc->shared = JOB_SHARED_MCS; else job_desc->shared = NO_VAL16; job_desc->spank_job_env_size = job_ptr->spank_job_env_size; job_desc->spank_job_env = xcalloc(job_desc->spank_job_env_size, sizeof(char *)); for (i = 0; i < job_desc->spank_job_env_size; i ++) job_desc->spank_job_env[i]= xstrdup(job_ptr->spank_job_env[i]); job_desc->std_err = xstrdup(details->std_err); job_desc->std_in = xstrdup(details->std_in); job_desc->std_out = xstrdup(details->std_out); job_desc->task_dist = details->task_dist; job_desc->time_limit = job_ptr->time_limit; job_desc->time_min = job_ptr->time_min; job_desc->user_id = job_ptr->user_id; job_desc->wait_all_nodes = job_ptr->wait_all_nodes; job_desc->warn_flags = job_ptr->warn_flags; job_desc->warn_signal = job_ptr->warn_signal; job_desc->warn_time = job_ptr->warn_time; job_desc->wckey = xstrdup(job_ptr->wckey); job_desc->work_dir = xstrdup(details->work_dir); job_desc->pn_min_cpus = details->pn_min_cpus; job_desc->pn_min_memory = details->pn_min_memory; job_desc->pn_min_tmp_disk = details->pn_min_tmp_disk; job_desc->min_cpus = details->min_cpus; job_desc->max_cpus = details->max_cpus; job_desc->min_nodes = details->min_nodes; job_desc->max_nodes = details->max_nodes; if (job_desc->max_nodes == 0) /* set 0 in _job_create() */ job_desc->max_nodes = NO_VAL; job_desc->sockets_per_node = mc_ptr->sockets_per_node; job_desc->cores_per_socket = mc_ptr->cores_per_socket; job_desc->threads_per_core = mc_ptr->threads_per_core; job_desc->cpus_per_task = details->cpus_per_task; job_desc->ntasks_per_node = details->ntasks_per_node; job_desc->ntasks_per_socket = mc_ptr->ntasks_per_socket; job_desc->ntasks_per_core = mc_ptr->ntasks_per_core; job_desc->cpus_per_tres = xstrdup(job_ptr->cpus_per_tres); job_desc->mem_per_tres = xstrdup(job_ptr->mem_per_tres); job_desc->tres_bind = xstrdup(job_ptr->tres_bind); job_desc->tres_freq = xstrdup(job_ptr->tres_freq); job_desc->tres_per_job = xstrdup(job_ptr->tres_per_job); job_desc->tres_per_node = xstrdup(job_ptr->tres_per_node); job_desc->tres_per_socket = xstrdup(job_ptr->tres_per_socket); job_desc->tres_per_task = xstrdup(job_ptr->tres_per_task); if (job_ptr->fed_details) { job_desc->fed_siblings_active = job_ptr->fed_details->siblings_active; job_desc->fed_siblings_viable = job_ptr->fed_details->siblings_viable; } return job_desc; } /* Build a bitmap of nodes completing this job */ extern void build_cg_bitmap(job_record_t *job_ptr) { FREE_NULL_BITMAP(job_ptr->node_bitmap_cg); if (job_ptr->node_bitmap) { job_ptr->node_bitmap_cg = bit_copy(job_ptr->node_bitmap); if (bit_set_count(job_ptr->node_bitmap_cg) == 0) job_ptr->job_state &= (~JOB_COMPLETING); } else { error("build_cg_bitmap: node_bitmap is NULL"); job_ptr->node_bitmap_cg = bit_alloc(node_record_count); job_ptr->job_state &= (~JOB_COMPLETING); } } /* job_hold_requeue() * * Requeue the job based upon its current state. * If JOB_SPECIAL_EXIT then requeue and hold with JOB_SPECIAL_EXIT state. * If JOB_REQUEUE_HOLD then requeue and hold. * If JOB_REQUEUE then requeue and let it run again. * The requeue can happen directly from job_requeue() or from * job_epilog_complete() after the last component has finished. * * RET returns true if the job was requeued */ extern bool job_hold_requeue(job_record_t *job_ptr) { uint32_t state; uint32_t flags; job_record_t *base_job_ptr = NULL; xassert(job_ptr); /* If the job is already pending it was * eventually requeued somewhere else. */ if (IS_JOB_PENDING(job_ptr) && !IS_JOB_REVOKED(job_ptr)) return false; /* If the job is not on the origin cluster, then don't worry about * requeueing the job here. The exit code will be sent the origin * cluster and the origin cluster will decide if the job should be * requeued or not. */ if (!fed_mgr_is_origin_job(job_ptr)) return false; /* * A job may be canceled during its epilog in which case we need to * check that the job (or base job in the case of an array) was not * canceled before attemping to requeue. */ if (IS_JOB_CANCELLED(job_ptr) || (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) && (base_job_ptr = find_job_record(job_ptr->array_job_id)) && base_job_ptr->array_recs && IS_JOB_CANCELLED(base_job_ptr))) return false; /* Check if the job exit with one of the * configured requeue values. */ _set_job_requeue_exit_value(job_ptr); state = job_ptr->job_state; if (! (state & JOB_REQUEUE)) return false; /* Sent event requeue to the database. */ if (!(job_ptr->bit_flags & TRES_STR_CALC) && job_ptr->tres_alloc_cnt && (job_ptr->tres_alloc_cnt[TRES_ENERGY] != NO_VAL64)) set_job_tres_alloc_str(job_ptr, false); jobacct_storage_g_job_complete(acct_db_conn, job_ptr); debug("%s: %pJ state 0x%x", __func__, job_ptr, state); /* Set the job pending */ flags = job_ptr->job_state & JOB_STATE_FLAGS; job_ptr->job_state = JOB_PENDING | flags; job_ptr->restart_cnt++; /* clear signal sent flag on requeue */ job_ptr->warn_flags &= ~WARN_SENT; /* * Test if user wants to requeue the job * in hold or with a special exit value. */ if (state & JOB_SPECIAL_EXIT) { /* * JOB_SPECIAL_EXIT means requeue the job, * put it on hold and display state as JOB_SPECIAL_EXIT. */ job_ptr->job_state |= JOB_SPECIAL_EXIT; job_ptr->state_reason = WAIT_HELD_USER; debug("%s: Holding %pJ, special exit", __func__, job_ptr); job_ptr->priority = 0; } job_ptr->job_state &= ~JOB_REQUEUE; /* * Mark array as requeued. Exit codes have already been handled in * _job_array_comp() */ if (((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) && (base_job_ptr = find_job_record(job_ptr->array_job_id)) && base_job_ptr->array_recs) { base_job_ptr->array_recs->array_flags |= ARRAY_TASK_REQUEUED; } debug("%s: %pJ state 0x%x reason %u priority %d", __func__, job_ptr, job_ptr->job_state, job_ptr->state_reason, job_ptr->priority); return true; } static void _parse_max_depend_depth(char *str) { int i = atoi(str); if (i < 0) error("ignoring max_depend_depth value of %d", i); else max_depend_depth = i; } extern void init_depend_policy(void) { char *depend_params = slurm_get_dependency_params(); char *sched_params = slurm_get_sched_params(); char *tmp_ptr; disable_remote_singleton = (xstrcasestr(depend_params, "disable_remote_singleton")) ? true : false; /* * kill_invalid_depend and max_depend_depth are moving from * SchedulerParameters to DependencyParameters. Support both for 20.02, * then remove them from SchedulerParameters in a future release. */ if (xstrcasestr(sched_params, "kill_invalid_depend")) { info("kill_invalid_depend is deprecated in SchedulerParameters and moved to DependencyParameters"); kill_invalid_dep = true; } else kill_invalid_dep = (xstrcasestr(depend_params, "kill_invalid_depend")) ? true : false; /* 01234567890123456 */ if ((tmp_ptr = xstrcasestr(depend_params, "max_depend_depth="))) _parse_max_depend_depth(tmp_ptr + 17); else if ((tmp_ptr = xstrcasestr(sched_params, "max_depend_depth="))) { info("max_depend_depth is deprecated in SchedulerParameters and moved to DependencyParameters"); _parse_max_depend_depth(tmp_ptr + 17); } else max_depend_depth = 10; xfree(depend_params); xfree(sched_params); if (slurmctld_conf.debug_flags & DEBUG_FLAG_DEPENDENCY) info("%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d", __func__, kill_invalid_dep, disable_remote_singleton, max_depend_depth); else debug2("%s: kill_invalid_depend is set to %d; disable_remote_singleton is set to %d; max_depend_depth is set to %d", __func__, kill_invalid_dep, disable_remote_singleton, max_depend_depth); } /* init_requeue_policy() * Initialize the requeue exit/hold bitmaps. */ extern void init_requeue_policy(void) { /* clean first as we can be reconfiguring */ FREE_NULL_BITMAP(requeue_exit); FREE_NULL_BITMAP(requeue_exit_hold); requeue_exit = _make_requeue_array(slurmctld_conf.requeue_exit); requeue_exit_hold = _make_requeue_array( slurmctld_conf.requeue_exit_hold); } /* _make_requeue_array() * * Process the RequeueExit|RequeueExitHold configuration * parameters creating two bitmaps holding the exit values * of jobs for which they have to be requeued. */ static bitstr_t *_make_requeue_array(char *conf_buf) { hostset_t hs; bitstr_t *bs = NULL; char *tok = NULL, *end_ptr = NULL; long val; if (conf_buf == NULL) return bs; xstrfmtcat(tok, "[%s]", conf_buf); hs = hostset_create(tok); xfree(tok); if (!hs) { error("%s: exit values: %s", __func__, conf_buf); return bs; } debug("%s: exit values: %s", __func__, conf_buf); bs = bit_alloc(MAX_EXIT_VAL + 1); while ((tok = hostset_shift(hs))) { val = strtol(tok, &end_ptr, 10); if ((end_ptr[0] == '\0') && (val >= 0) && (val <= MAX_EXIT_VAL)) { bit_set(bs, val); } else { error("%s: exit values: %s (%s)", __func__, conf_buf, tok); } free(tok); } hostset_destroy(hs); return bs; } /* _set_job_requeue_exit_value() * * Compared the job exit values with the configured * RequeueExit and RequeueHoldExit and a match is * found, set the appropriate state for job_hold_requeue() */ static void _set_job_requeue_exit_value(job_record_t *job_ptr) { int exit_code; exit_code = WEXITSTATUS(job_ptr->exit_code); if ((exit_code < 0) || (exit_code > MAX_EXIT_VAL)) return; if (requeue_exit && bit_test(requeue_exit, exit_code)) { debug2("%s: %pJ exit code %d state JOB_REQUEUE", __func__, job_ptr, exit_code); job_ptr->job_state |= JOB_REQUEUE; return; } if (requeue_exit_hold && bit_test(requeue_exit_hold, exit_code)) { /* Not sure if want to set special exit state in this case */ debug2("%s: %pJ exit code %d state JOB_SPECIAL_EXIT", __func__, job_ptr, exit_code); job_ptr->job_state |= JOB_REQUEUE; job_ptr->job_state |= JOB_SPECIAL_EXIT; return; } } /* * Reset a job's end_time based upon it's start_time and time_limit. * NOTE: Do not reset the end_time if already being preempted */ extern void job_end_time_reset(job_record_t *job_ptr) { if (job_ptr->preempt_time) return; /* Preemption in progress */ if (job_ptr->time_limit == INFINITE) { job_ptr->end_time = job_ptr->start_time + (365 * 24 * 60 * 60); /* secs in year */ } else { job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60); /* secs */ } job_ptr->end_time_exp = job_ptr->end_time; } /* trace_job() - print the job details if * the DEBUG_FLAG_TRACE_JOBS is set */ extern void trace_job(job_record_t *job_ptr, const char *func, const char *extra) { if (slurmctld_conf.debug_flags & DEBUG_FLAG_TRACE_JOBS) { info("%s: %s %pJ", func, extra, job_ptr); } } /* If this is a job array meta-job, prepare it for being scheduled */ extern void job_array_pre_sched(job_record_t *job_ptr) { int32_t i; if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap) return; i = bit_ffs(job_ptr->array_recs->task_id_bitmap); if (i < 0) { /* This happens if the final task in a meta-job is requeued */ if (job_ptr->restart_cnt == 0) { error("%pJ has empty task_id_bitmap", job_ptr); } FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); return; } job_ptr->array_job_id = job_ptr->job_id; job_ptr->array_task_id = i; } /* If this is a job array meta-job, clean up after scheduling attempt */ extern job_record_t *job_array_post_sched(job_record_t *job_ptr) { job_record_t *new_job_ptr = NULL; if (!job_ptr->array_recs || !job_ptr->array_recs->task_id_bitmap) return job_ptr; if (job_ptr->array_recs->task_cnt <= 1) { /* Preserve array_recs for min/max exit codes for job array */ if (job_ptr->array_recs->task_cnt) { job_ptr->array_recs->task_cnt--; } else if (job_ptr->restart_cnt) { /* Last task of a job array has been requeued */ } else { error("job %pJ array_recs task count underflow", job_ptr); } xfree(job_ptr->array_recs->task_id_str); if (job_ptr->array_recs->task_cnt == 0) FREE_NULL_BITMAP(job_ptr->array_recs->task_id_bitmap); /* While it is efficient to set the db_index to 0 here * to get the database to update the record for * pending tasks it also creates a window in which if * the association id is changed (different account or * partition) instead of returning the previous * db_index (expected) it would create a new one * leaving the other orphaned. Setting the job_state * sets things up so the db_index isn't lost but the * start message is still sent to get the desired behavior. */ if (job_ptr->db_index) job_ptr->job_state |= JOB_UPDATE_DB; /* If job is requeued, it will already be in the hash table */ if (!find_job_array_rec(job_ptr->array_job_id, job_ptr->array_task_id)) { _add_job_array_hash(job_ptr); } new_job_ptr = job_ptr; } else { new_job_ptr = job_array_split(job_ptr); if (new_job_ptr) { new_job_ptr->job_state = JOB_PENDING; new_job_ptr->start_time = (time_t) 0; /* Do NOT set the JOB_UPDATE_DB flag here, it * is handled when task_id_str is created elsewhere */ } else { error("%s: Unable to copy record for %pJ", __func__, job_ptr); } } return new_job_ptr; } /* _kill_dependent() * * Exterminate the job that has invalid dependency * condition. */ static void _kill_dependent(job_record_t *job_ptr) { time_t now = time(NULL); info("%s: Job dependency can't be satisfied, cancelling %pJ", __func__, job_ptr); job_ptr->job_state = JOB_CANCELLED; job_ptr->start_time = now; job_ptr->end_time = now; job_completion_logger(job_ptr, false); last_job_update = now; srun_allocate_abort(job_ptr); } static job_fed_details_t *_dup_job_fed_details(job_fed_details_t *src) { job_fed_details_t *dst = NULL; if (!src) return NULL; dst = xmalloc(sizeof(job_fed_details_t)); memcpy(dst, src, sizeof(job_fed_details_t)); dst->origin_str = xstrdup(src->origin_str); dst->siblings_active_str = xstrdup(src->siblings_active_str); dst->siblings_viable_str = xstrdup(src->siblings_viable_str); return dst; } extern void free_job_fed_details(job_fed_details_t **fed_details_pptr) { job_fed_details_t *fed_details_ptr = *fed_details_pptr; if (fed_details_ptr) { xfree(fed_details_ptr->origin_str); xfree(fed_details_ptr->siblings_active_str); xfree(fed_details_ptr->siblings_viable_str); xfree(fed_details_ptr); *fed_details_pptr = NULL; } } static void _dump_job_fed_details(job_fed_details_t *fed_details_ptr, Buf buffer) { if (fed_details_ptr) { pack16(1, buffer); pack32(fed_details_ptr->cluster_lock, buffer); packstr(fed_details_ptr->origin_str, buffer); pack64(fed_details_ptr->siblings_active, buffer); packstr(fed_details_ptr->siblings_active_str, buffer); pack64(fed_details_ptr->siblings_viable, buffer); packstr(fed_details_ptr->siblings_viable_str, buffer); } else { pack16(0, buffer); } } static int _load_job_fed_details(job_fed_details_t **fed_details_pptr, Buf buffer, uint16_t protocol_version) { uint16_t tmp_uint16; uint32_t tmp_uint32; job_fed_details_t *fed_details_ptr = NULL; xassert(fed_details_pptr); if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) { safe_unpack16(&tmp_uint16, buffer); if (tmp_uint16) { *fed_details_pptr = xmalloc(sizeof(job_fed_details_t)); fed_details_ptr = *fed_details_pptr; safe_unpack32(&fed_details_ptr->cluster_lock, buffer); safe_unpackstr_xmalloc(&fed_details_ptr->origin_str, &tmp_uint32, buffer); safe_unpack64(&fed_details_ptr->siblings_active, buffer); safe_unpackstr_xmalloc( &fed_details_ptr->siblings_active_str, &tmp_uint32, buffer); safe_unpack64(&fed_details_ptr->siblings_viable, buffer); safe_unpackstr_xmalloc( &fed_details_ptr->siblings_viable_str, &tmp_uint32, buffer); } } else goto unpack_error; return SLURM_SUCCESS; unpack_error: free_job_fed_details(fed_details_pptr); *fed_details_pptr = NULL; return SLURM_ERROR; } /* Set federated job's sibling strings. */ extern void update_job_fed_details(job_record_t *job_ptr) { xassert(job_ptr); xassert(job_ptr->fed_details); xfree(job_ptr->fed_details->siblings_active_str); xfree(job_ptr->fed_details->siblings_viable_str); job_ptr->fed_details->siblings_active_str = fed_mgr_cluster_ids_to_names( job_ptr->fed_details->siblings_active); job_ptr->fed_details->siblings_viable_str = fed_mgr_cluster_ids_to_names( job_ptr->fed_details->siblings_viable); /* only set once */ if (!job_ptr->fed_details->origin_str) job_ptr->fed_details->origin_str = fed_mgr_get_cluster_name( fed_mgr_get_cluster_id(job_ptr->job_id)); } /* * Set the allocation response with the current cluster's information and the * job's allocated node's addr's if the allocation is being filled by a cluster * other than the cluster that submitted the job * * Note: make sure that the resp's working_cluster_rec is NULL'ed out before the * resp is free'd since it points to global memory. * * IN resp - allocation response being sent back to client. * IN job_ptr - allocated job * IN req_cluster - the cluster requsting the allocation info. */ extern void set_remote_working_response( resource_allocation_response_msg_t *resp, job_record_t *job_ptr, const char *req_cluster) { xassert(resp); xassert(job_ptr); if (job_ptr->node_cnt && req_cluster && slurmctld_conf.cluster_name && xstrcmp(slurmctld_conf.cluster_name, req_cluster)) { if (job_ptr->fed_details && fed_mgr_cluster_rec) { resp->working_cluster_rec = fed_mgr_cluster_rec; } else { resp->working_cluster_rec = response_cluster_rec; } resp->node_addr = xcalloc(job_ptr->node_cnt, sizeof(slurm_addr_t)); memcpy(resp->node_addr, job_ptr->node_addr, (sizeof(slurm_addr_t) * job_ptr->node_cnt)); } } /* Build structure with job allocation details */ extern resource_allocation_response_msg_t *build_job_info_resp( job_record_t *job_ptr) { resource_allocation_response_msg_t *job_info_resp_msg; int i, j; job_info_resp_msg = xmalloc(sizeof(resource_allocation_response_msg_t)); if (!job_ptr->job_resrcs) { ; } else if (bit_equal(job_ptr->node_bitmap, job_ptr->job_resrcs->node_bitmap)) { job_info_resp_msg->num_cpu_groups = job_ptr->job_resrcs->cpu_array_cnt; job_info_resp_msg->cpu_count_reps = xcalloc(job_ptr->job_resrcs->cpu_array_cnt, sizeof(uint32_t)); memcpy(job_info_resp_msg->cpu_count_reps, job_ptr->job_resrcs->cpu_array_reps, (sizeof(uint32_t) * job_ptr->job_resrcs->cpu_array_cnt)); job_info_resp_msg->cpus_per_node = xcalloc(job_ptr->job_resrcs->cpu_array_cnt, sizeof(uint16_t)); memcpy(job_info_resp_msg->cpus_per_node, job_ptr->job_resrcs->cpu_array_value, (sizeof(uint16_t) * job_ptr->job_resrcs->cpu_array_cnt)); } else { /* Job has changed size, rebuild CPU count info */ job_info_resp_msg->num_cpu_groups = job_ptr->node_cnt; job_info_resp_msg->cpu_count_reps = xcalloc(job_ptr->node_cnt, sizeof(uint32_t)); job_info_resp_msg->cpus_per_node = xcalloc(job_ptr->node_cnt, sizeof(uint32_t)); for (i = 0, j = -1; i < job_ptr->job_resrcs->nhosts; i++) { if (job_ptr->job_resrcs->cpus[i] == 0) continue; if ((j == -1) || (job_info_resp_msg->cpus_per_node[j] != job_ptr->job_resrcs->cpus[i])) { j++; job_info_resp_msg->cpus_per_node[j] = job_ptr->job_resrcs->cpus[i]; job_info_resp_msg->cpu_count_reps[j] = 1; } else { job_info_resp_msg->cpu_count_reps[j]++; } } job_info_resp_msg->num_cpu_groups = j + 1; } job_info_resp_msg->account = xstrdup(job_ptr->account); job_info_resp_msg->alias_list = xstrdup(job_ptr->alias_list); job_info_resp_msg->job_id = job_ptr->job_id; job_info_resp_msg->node_cnt = job_ptr->node_cnt; job_info_resp_msg->node_list = xstrdup(job_ptr->nodes); job_info_resp_msg->partition = xstrdup(job_ptr->partition); if (job_ptr->qos_ptr) { slurmdb_qos_rec_t *qos; qos = (slurmdb_qos_rec_t *)job_ptr->qos_ptr; job_info_resp_msg->qos = xstrdup(qos->name); } job_info_resp_msg->resv_name = xstrdup(job_ptr->resv_name); job_info_resp_msg->select_jobinfo = select_g_select_jobinfo_copy(job_ptr->select_jobinfo); if (job_ptr->details) { if (job_ptr->bit_flags & JOB_MEM_SET) { job_info_resp_msg->pn_min_memory = job_ptr->details->pn_min_memory; } if (job_ptr->details->mc_ptr) { job_info_resp_msg->ntasks_per_board = job_ptr->details->mc_ptr->ntasks_per_board; job_info_resp_msg->ntasks_per_core = job_ptr->details->mc_ptr->ntasks_per_core; job_info_resp_msg->ntasks_per_socket = job_ptr->details->mc_ptr->ntasks_per_socket; } } else { /* job_info_resp_msg->pn_min_memory = 0; */ job_info_resp_msg->ntasks_per_board = NO_VAL16; job_info_resp_msg->ntasks_per_core = NO_VAL16; job_info_resp_msg->ntasks_per_socket = NO_VAL16; } if (job_ptr->details && job_ptr->details->env_cnt) { job_info_resp_msg->env_size = job_ptr->details->env_cnt; job_info_resp_msg->environment = xcalloc(job_info_resp_msg->env_size + 1, sizeof(char *)); for (i = 0; i < job_info_resp_msg->env_size; i++) { job_info_resp_msg->environment[i] = xstrdup(job_ptr->details->env_sup[i]); } job_info_resp_msg->environment[i] = NULL; } return job_info_resp_msg; } /* * Calculate billable TRES based on partition's defined BillingWeights. If none * is defined, return total_cpus. This is cached on job_ptr->billable_tres and * is updated if the job was resized since the last iteration. * * IN job_ptr - job to calc billable tres on * IN start_time - time the has started or been resized * IN assoc_mgr_locked - whether the tres assoc lock is set or not */ extern double calc_job_billable_tres(job_record_t *job_ptr, time_t start_time, bool assoc_mgr_locked) { xassert(job_ptr); part_record_t *part_ptr = job_ptr->part_ptr; /* We don't have any resources allocated, just return 0. */ if (!job_ptr->tres_alloc_cnt) return 0; /* Don't recalculate unless the job is new or resized */ if ((!fuzzy_equal(job_ptr->billable_tres, NO_VAL)) && difftime(job_ptr->resize_time, start_time) < 0.0) return job_ptr->billable_tres; if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO) info("BillingWeight: %pJ is either new or it was resized", job_ptr); /* No billing weights defined. Return CPU count */ if (!part_ptr || !part_ptr->billing_weights) { job_ptr->billable_tres = job_ptr->total_cpus; return job_ptr->billable_tres; } if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO) info("BillingWeight: %pJ using \"%s\" from partition %s", job_ptr, part_ptr->billing_weights_str, job_ptr->part_ptr->name); job_ptr->billable_tres = assoc_mgr_tres_weighted(job_ptr->tres_alloc_cnt, part_ptr->billing_weights, slurmctld_conf.priority_flags, assoc_mgr_locked); if (slurmctld_conf.debug_flags & DEBUG_FLAG_PRIO) info("BillingWeight: %pJ %s = %f", job_ptr, (slurmctld_conf.priority_flags & PRIORITY_FLAGS_MAX_TRES) ? "MAX(node TRES) + SUM(Global TRES)" : "SUM(TRES)", job_ptr->billable_tres); return job_ptr->billable_tres; } extern void update_job_limit_set_tres(uint16_t **limits_pptr) { int i, old_pos; int new_size = sizeof(uint16_t) * slurmctld_tres_cnt; xassert(limits_pptr); *limits_pptr = xrealloc(*limits_pptr, new_size); if (assoc_mgr_tres_pos_changed()) { uint16_t *limits_ptr, tmp_tres[slurmctld_tres_cnt]; limits_ptr = *limits_pptr; for (i = 0; i < slurmctld_tres_cnt; i++) { if ((old_pos = assoc_mgr_get_old_tres_pos(i)) == -1) tmp_tres[i] = 0; else tmp_tres[i] = limits_ptr[old_pos]; } memcpy(limits_ptr, tmp_tres, new_size); } } /* * Send warning signal to job before end time. * * IN job_ptr - job to send warn signal to. * IN ignore_time - If set, ignore the warn time and just send it. */ extern void send_job_warn_signal(job_record_t *job_ptr, bool ignore_time) { if (job_ptr->warn_signal && !(job_ptr->warn_flags & WARN_SENT) && (ignore_time || (job_ptr->warn_time && ((job_ptr->warn_time + PERIODIC_TIMEOUT + time(NULL)) >= job_ptr->end_time)))) { /* * If --signal B option was not specified, * signal only the steps but not the batch step. */ if (!(job_ptr->warn_flags & KILL_JOB_BATCH)) job_ptr->warn_flags |= KILL_STEPS_ONLY; debug("%s: warning signal %u to %pJ", __func__, job_ptr->warn_signal, job_ptr); job_signal(job_ptr, job_ptr->warn_signal, job_ptr->warn_flags, 0, false); /* mark job as signaled */ job_ptr->warn_flags |= WARN_SENT; } } static int _overlap_and_running_internal(void *x, void *arg) { job_record_t *job_ptr = (job_record_t *)x; job_overlap_args_t *overlap_args = (job_overlap_args_t *)arg; /* We always break if we find something not running */ if ((!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr))) { overlap_args->rc = 0; return 1; } /* * We are just looking for something overlapping. On a hetjob we need * to check everything. */ if (job_ptr->node_bitmap && bit_overlap_any(overlap_args->node_map, job_ptr->node_bitmap)) overlap_args->rc = 1; return 0; } extern bool job_overlap_and_running(bitstr_t *node_map, job_record_t *job_ptr) { job_overlap_args_t overlap_args = { .node_map = node_map }; if (!job_ptr->het_job_list) (void)_overlap_and_running_internal(job_ptr, &overlap_args); else (void)list_for_each(job_ptr->het_job_list, _overlap_and_running_internal, &overlap_args); return overlap_args.rc; }