/*****************************************************************************\
 *  backfill.c - simple backfill scheduler plugin.
 *
 *  If a partition does not have root only access and nodes are not shared
 *  then raise the priority of pending jobs if doing so does not adversely
 *  affect the expected initiation of any higher priority job. We do not
 *  alter a job's required or excluded node list, so this is a conservative
 *  algorithm.
 *
 *  For example, consider a cluster "lx[01-08]" with one job executing on
 *  nodes "lx[01-04]". The highest priority pending job requires five nodes
 *  including "lx05". The next highest priority pending job requires any
 *  three nodes. Without explicitly forcing the second job to use nodes
 *  "lx[06-08]", we can't start it without possibly delaying the higher
 *  priority job.
 *****************************************************************************
 *  Copyright (C) 2003-2007 The Regents of the University of California.
 *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
 *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
 *  Written by Morris Jette <jette1@llnl.gov>
 *  CODE-OCEC-09-009. All rights reserved.
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version. If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
\*****************************************************************************/

#include "config.h"

#if HAVE_SYS_PRCTL_H
#  include <sys/prctl.h>
#endif

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include "slurm/slurm.h"
#include "slurm/slurmdb.h"
#include "slurm/slurm_errno.h"

#include "src/common/assoc_mgr.h"
#include "src/common/gres.h"
#include "src/common/list.h"
#include "src/common/macros.h"
#include "src/common/node_features.h"
#include "src/common/node_select.h"
#include "src/common/parse_time.h"
#include "src/common/power.h"
#include "src/common/read_config.h"
#include "src/common/slurm_accounting_storage.h"
#include "src/common/slurm_mcs.h"
#include "src/common/slurm_protocol_api.h"
#include "src/common/xmalloc.h"
#include "src/common/xstring.h"

#include "src/slurmctld/acct_policy.h"
#include "src/slurmctld/burst_buffer.h"
#include "src/slurmctld/fed_mgr.h"
#include "src/slurmctld/front_end.h"
#include "src/slurmctld/job_scheduler.h"
#include "src/slurmctld/licenses.h"
#include "src/slurmctld/locks.h"
#include "src/slurmctld/node_scheduler.h"
#include "src/slurmctld/preempt.h"
#include "src/slurmctld/reservation.h"
#include "src/slurmctld/slurmctld.h"
#include "src/slurmctld/srun_comm.h"
#include "backfill.h"

#define BACKFILL_INTERVAL   30
#define BACKFILL_RESOLUTION 60
#define BACKFILL_WINDOW     (24 * 60 * 60)
#define BF_MAX_JOB_ARRAY_RESV 20

#define SLURMCTLD_THREAD_LIMIT 5
#define YIELD_INTERVAL 2000000  /* time in micro-seconds */
#define YIELD_SLEEP    500000   /* time in micro-seconds */

#define MAX_BACKFILL_INTERVAL         10800 /* 3 hours */
#define MAX_BACKFILL_RESOLUTION       3600  /* 1 hour */
#define MAX_BACKFILL_WINDOW           (30 * 24 * 60 * 60) /* 30 days */
#define MAX_BF_JOB_PART_COUNT_RESERVE 100000
#define MAX_BF_MAX_JOB_ARRAY_RESV     1000
#define MAX_BF_MAX_JOB_START          10000
#define MAX_BF_MAX_JOB_TEST           1000000
#define MAX_BF_MAX_TIME               3600
#define MAX_BF_MIN_AGE_RESERVE        (30 * 24 * 60 * 60) /* 30 days */
#define MAX_BF_MIN_PRIO_RESERVE       INFINITE
#define MAX_BF_YIELD_INTERVAL         10000000 /* 10 seconds in usec */
#define MAX_MAX_RPC_CNT               1000
#define MAX_YIELD_SLEEP               10000000 /* 10 seconds in usec */

#define MAX_BF_MAX_JOB_ASSOC     MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_USER      MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_USER_PART MAX_BF_MAX_JOB_TEST
#define MAX_BF_MAX_JOB_PART      MAX_BF_MAX_JOB_TEST

typedef struct node_space_map {
    time_t begin_time;
    time_t end_time;
    bitstr_t *avail_bitmap;
    int next;   /* next record, by time, zero termination */
} node_space_map_t;

typedef struct node_space_handler {
    node_space_map_t *node_space;
    int *node_space_recs;
} node_space_handler_t;

/*
 * HetJob scheduling structures
 * NOTE: An individual hetjob component can be submitted to multiple
 *       partitions and have different start times in each
 */
typedef struct het_job_rec {
    uint32_t job_id;
    job_record_t *job_ptr;
    time_t latest_start;    /* Time when expected to start */
    part_record_t *part_ptr;
} het_job_rec_t;

typedef struct het_job_map {
    uint32_t comp_time_limit;   /* Time limit for hetjob */
    uint32_t het_job_id;
    List het_job_rec_list;      /* List of het_job_rec_t */
    time_t prev_start;          /* Expected start time from last test */
} het_job_map_t;

typedef struct deadlock_job_struct {
    uint32_t het_job_id;
    time_t start_time;
} deadlock_job_struct_t;

typedef struct deadlock_part_struct {
    List deadlock_job_list;
    part_record_t *part_ptr;
} deadlock_part_struct_t;

/* Diagnostic statistics */
extern diag_stats_t slurmctld_diag_stats;
uint32_t bf_sleep_usec = 0;

typedef struct backfill_user_usage {
    slurmdb_bf_usage_t bf_usage;
    uid_t uid;
} bf_user_usage_t;
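/*
 * NOTE (added for clarity): bf_user_usage_t entries live in the
 * user_usage_map xhash declared below. The map is keyed by uid (see
 * _bf_map_key_id()) and is only populated when no association/user record
 * is available from the accounting database; see _job_exceeds_max_bf_param()
 * for the lookup.
 */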
/*********************** local variables *********************/
static bool stop_backfill = false;
static pthread_mutex_t thread_flag_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t term_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  term_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;
static bool config_flag = false;
static uint64_t debug_flags = 0;
static int backfill_interval = BACKFILL_INTERVAL;
static int bf_max_time = BACKFILL_INTERVAL;
static int backfill_resolution = BACKFILL_RESOLUTION;
static int backfill_window = BACKFILL_WINDOW;
static int bf_job_part_count_reserve = 0;
static int bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
static int bf_min_age_reserve = 0;
static bool bf_running_job_reserve = false;
static uint32_t bf_min_prio_reserve = 0;
static List deadlock_global_list;
static bool bf_hetjob_immediate = false;
static uint16_t bf_hetjob_prio = 0;
static bool bf_one_resv_per_job = false;
static uint32_t job_start_cnt = 0;
static int max_backfill_job_cnt = 100;
static int max_backfill_job_per_assoc = 0;
static int max_backfill_job_per_part = 0;
static int max_backfill_job_per_user = 0;
static int max_backfill_job_per_user_part = 0;
static int max_backfill_jobs_start = 0;
static bool backfill_continue = false;
static bool assoc_limit_stop = false;
static int max_rpc_cnt = 0;
static int yield_interval = YIELD_INTERVAL;
static int yield_sleep = YIELD_SLEEP;
static List het_job_list = NULL;
static xhash_t *user_usage_map = NULL; /* look up user usage when no assoc */

/*********************** local functions *********************/
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
                             bitstr_t *res_bitmap,
                             node_space_map_t *node_space,
                             int *node_space_recs);
static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val);
static int _attempt_backfill(void);
static int _clear_job_estimates(void *x, void *arg);
static int _clear_qos_blocked_times(void *x, void *arg);
static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2,
                           int node_space_recs);
static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now,
                                node_space_map_t *node_space);
static bool _hetjob_any_resv(job_record_t *het_leader);
static uint32_t _hetjob_calc_prio(job_record_t *het_leader);
static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader);
static void _het_job_deadlock_fini(void);
static bool _het_job_deadlock_test(job_record_t *job_ptr);
static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr);
static void _load_config(void);
static bool _many_pending_rpcs(void);
static bool _more_work(time_t last_backfill_time);
static uint32_t _my_sleep(int64_t usec);
static int _num_feature_count(job_record_t *job_ptr, bool *has_xand,
                              bool *has_xor);
static int _het_job_find_map(void *x, void *key);
static void _het_job_map_del(void *x);
static void _het_job_start_clear(void);
static time_t _het_job_start_find(job_record_t *job_ptr, time_t now);
static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start,
                               uint32_t comp_time_limit);
static void _het_job_start_test_single(node_space_map_t *node_space,
                                       het_job_map_t *map, bool single);
static int _het_job_start_test_list(void *map, void *node_space);
static void _het_job_start_test(node_space_map_t *node_space,
                                uint32_t het_job_id);
static void _reset_job_time_limit(job_record_t *job_ptr, time_t now,
                                  node_space_map_t *node_space);
static int _set_hetjob_details(void *x, void *arg);
static int _start_job(job_record_t *job_ptr, bitstr_t *avail_bitmap);
static bool _test_resv_overlap(node_space_map_t *node_space,
                               bitstr_t *use_bitmap, uint32_t start_time,
                               uint32_t end_reserve);
static int _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap,
                      uint32_t min_nodes, uint32_t max_nodes,
                      uint32_t req_nodes, bitstr_t *exc_core_bitmap);
static int _yield_locks(int64_t usec);
static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len);
static void _bf_map_free(void *item);

/* Log resources to be allocated to a pending job */
static void _dump_job_sched(job_record_t *job_ptr, time_t end_time,
                            bitstr_t *avail_bitmap)
{
    char begin_buf[32], end_buf[32], *node_list;

    slurm_make_time_str(&job_ptr->start_time, begin_buf, sizeof(begin_buf));
    slurm_make_time_str(&end_time, end_buf, sizeof(end_buf));
    node_list = bitmap2node_name(avail_bitmap);
    info("%pJ to start at %s, end at %s on nodes %s in partition %s",
         job_ptr, begin_buf, end_buf, node_list, job_ptr->part_ptr->name);
    xfree(node_list);
}

static void _dump_job_test(job_record_t *job_ptr, bitstr_t *avail_bitmap,
                           time_t start_time)
{
    char begin_buf[32], *node_list;

    if (start_time == 0)
        strcpy(begin_buf, "NOW");
    else
        slurm_make_time_str(&start_time, begin_buf, sizeof(begin_buf));
    node_list = bitmap2node_name(avail_bitmap);
    info("Test %pJ at %s on %s", job_ptr, begin_buf, node_list);
    xfree(node_list);
}

/* Log resource allocation table */
static void _dump_node_space_table(node_space_map_t *node_space_ptr)
{
    int i = 0;
    char begin_buf[32], end_buf[32], *node_list;

    info("=========================================");
    while (1) {
        slurm_make_time_str(&node_space_ptr[i].begin_time,
                            begin_buf, sizeof(begin_buf));
        slurm_make_time_str(&node_space_ptr[i].end_time,
                            end_buf, sizeof(end_buf));
        node_list = bitmap2node_name(node_space_ptr[i].avail_bitmap);
        info("Begin:%s End:%s Nodes:%s", begin_buf, end_buf, node_list);
        xfree(node_list);
        if ((i = node_space_ptr[i].next) == 0)
            break;
    }
    info("=========================================");
}
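/*
 * Illustration (added for clarity, not from the original source): the
 * node_space table is a time-ordered list of availability records threaded
 * through the "next" index and terminated when next == 0. For example,
 * after reserving nodes "lx[05-08]" from T1 to T2 inside an initially free
 * window [T0, T3), a dump might show:
 *
 *     Begin:T0 End:T1 Nodes:lx[01-08]   (next -> record 1)
 *     Begin:T1 End:T2 Nodes:lx[01-04]   (next -> record 2)
 *     Begin:T2 End:T3 Nodes:lx[01-08]   (next == 0, end of list)
 *
 * _add_reservation() is expected to split records at the reservation
 * boundaries and remove the reserved nodes from every record the
 * reservation overlaps.
 */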
static void _set_job_time_limit(job_record_t *job_ptr, uint32_t new_limit)
{
    job_ptr->time_limit = new_limit;
    /* reset flag if we have a NO_VAL time_limit */
    if (job_ptr->time_limit == NO_VAL)
        job_ptr->limit_set.time = 0;
}

/*
 * _many_pending_rpcs - Determine if slurmctld is busy with many active RPCs
 * RET - True if slurmctld currently has more than SLURMCTLD_THREAD_LIMIT
 *       active RPCs
 */
static bool _many_pending_rpcs(void)
{
    bool many_pending_rpcs = false;

    slurm_mutex_lock(&slurmctld_config.thread_count_lock);
    //info("thread_count = %u", slurmctld_config.server_thread_count);
    if ((max_rpc_cnt > 0) &&
        (slurmctld_config.server_thread_count >= max_rpc_cnt))
        many_pending_rpcs = true;
    slurm_mutex_unlock(&slurmctld_config.thread_count_lock);

    return many_pending_rpcs;
}

/*
 * Report summary of job's feature specification
 * IN job_ptr - job to schedule
 * OUT has_xand - true if features are XANDed together
 * OUT has_xor - true if features are XORed together
 * RET Total count for ALL job features, even counts with XAND separator
 */
static int _num_feature_count(job_record_t *job_ptr, bool *has_xand,
                              bool *has_xor)
{
    struct job_details *detail_ptr = job_ptr->details;
    int rc = 0;
    ListIterator feat_iter;
    job_feature_t *feat_ptr;

    *has_xand = false;
    *has_xor = false;
    if (detail_ptr->feature_list == NULL)   /* no constraints */
        return rc;

    feat_iter = list_iterator_create(detail_ptr->feature_list);
    while ((feat_ptr = (job_feature_t *) list_next(feat_iter))) {
        if (feat_ptr->count)
            rc++;
        if (feat_ptr->op_code == FEATURE_OP_XAND)
            *has_xand = true;
        if (feat_ptr->op_code == FEATURE_OP_XOR)
            *has_xor = true;
    }
    list_iterator_destroy(feat_iter);

    return rc;
}

static int _clear_qos_blocked_times(void *x, void *arg)
{
    slurmdb_qos_rec_t *qos_ptr = (slurmdb_qos_rec_t *) x;

    qos_ptr->blocked_until = 0;

    return 0;
}

/*
 * Attempt to schedule a specific job on specific available nodes
 * IN job_ptr - job to schedule
 * IN/OUT avail_bitmap - nodes available/selected to use
 * IN exc_core_bitmap - cores which can not be used
 * RET SLURM_SUCCESS on success, otherwise an error code
 */
static int _try_sched(job_record_t *job_ptr, bitstr_t **avail_bitmap,
                      uint32_t min_nodes, uint32_t max_nodes,
                      uint32_t req_nodes, bitstr_t *exc_core_bitmap)
{
    bitstr_t *low_bitmap = NULL, *tmp_bitmap = NULL;
    int rc = SLURM_SUCCESS;
    bool has_xand = false, has_xor = false;
    int feat_cnt = _num_feature_count(job_ptr, &has_xand, &has_xor);
    struct job_details *detail_ptr = job_ptr->details;
    List feature_cache = detail_ptr->feature_list;
    List preemptee_candidates = NULL;
    ListIterator feat_iter;
    job_feature_t *feat_ptr;
    job_feature_t *feature_base;

    if (has_xand || feat_cnt) {
        /*
         * Cache the feature information and test the individual
         * features (or sets of features in parenthesis), one at a time
         */
        time_t high_start = 0;
        uint32_t feat_min_node;
        uint32_t feat_node_cnt;

        tmp_bitmap = bit_copy(*avail_bitmap);
        feat_iter = list_iterator_create(feature_cache);
        while ((feat_ptr = (job_feature_t *) list_next(feat_iter)) &&
               (rc == SLURM_SUCCESS)) {
            detail_ptr->feature_list =
                list_create(feature_list_delete);
            feature_base = xmalloc(sizeof(job_feature_t));
            feature_base->name = xstrdup(feat_ptr->name);
            feature_base->op_code = feat_ptr->op_code;
            list_append(detail_ptr->feature_list, feature_base);
            feat_min_node = feat_ptr->count;
            while ((feat_ptr->paren > 0) &&
                   ((feat_ptr = (job_feature_t *)
                     list_next(feat_iter)))) {
                feature_base = xmalloc(sizeof(job_feature_t));
                feature_base->name = xstrdup(feat_ptr->name);
                feature_base->op_code = feat_ptr->op_code;
                feat_min_node = feat_ptr->count;
                list_append(detail_ptr->feature_list, feature_base);
            }
            feature_base->op_code = FEATURE_OP_END;
            feat_min_node = MAX(1, feat_min_node);

            if ((job_req_node_filter(job_ptr, *avail_bitmap, true) ==
                 SLURM_SUCCESS) &&
                (bit_set_count(*avail_bitmap) >= feat_min_node)) {
                preemptee_candidates =
                    slurm_find_preemptable_jobs(job_ptr);
                rc = select_g_job_test(job_ptr, *avail_bitmap,
                                       feat_min_node, max_nodes,
                                       feat_min_node,
                                       SELECT_MODE_WILL_RUN,
                                       preemptee_candidates, NULL,
                                       exc_core_bitmap);
                FREE_NULL_LIST(preemptee_candidates);
                if (rc == SLURM_SUCCESS) {
                    if ((high_start == 0) ||
                        (high_start < job_ptr->start_time))
                        high_start = job_ptr->start_time;
                    if (low_bitmap) {
                        bit_or(low_bitmap, *avail_bitmap);
                    } else {
                        low_bitmap = *avail_bitmap;
                        *avail_bitmap = NULL;
                    }
                }
            } else {
                rc = ESLURM_NODES_BUSY;
            }
            FREE_NULL_BITMAP(*avail_bitmap);
            *avail_bitmap = bit_copy(tmp_bitmap);
            if (low_bitmap)
                bit_and_not(*avail_bitmap, low_bitmap);
            list_destroy(detail_ptr->feature_list);
        }
        list_iterator_destroy(feat_iter);

        if (low_bitmap)
            feat_node_cnt = bit_set_count(low_bitmap);
        else
            feat_node_cnt = 0;
        if (feat_node_cnt < req_nodes) {
            detail_ptr->feature_list = NULL;
            rc = select_g_job_test(job_ptr, *avail_bitmap,
                                   min_nodes - feat_node_cnt,
                                   max_nodes - feat_node_cnt,
                                   req_nodes - feat_node_cnt,
                                   SELECT_MODE_WILL_RUN,
                                   preemptee_candidates, NULL,
                                   exc_core_bitmap);
            if (low_bitmap) {
                bit_or(low_bitmap, *avail_bitmap);
            } else {
                low_bitmap = *avail_bitmap;
                *avail_bitmap = NULL;
            }
        }
        FREE_NULL_BITMAP(tmp_bitmap);

        if (high_start && rc == SLURM_SUCCESS) {
            job_ptr->start_time = high_start;
            FREE_NULL_BITMAP(*avail_bitmap);
            *avail_bitmap = low_bitmap;
        } else {
            rc = ESLURM_NODES_BUSY;
            job_ptr->start_time = 0;
            FREE_NULL_BITMAP(*avail_bitmap);
            FREE_NULL_BITMAP(low_bitmap);
        }

        /* Restore the original feature information */
        detail_ptr->feature_list = feature_cache;
    } else if (has_xor) {
        /*
         * Cache the feature information and test the individual
         * features (or sets of features in parenthesis), one at a time
         */
        job_feature_t *feature_base;
        List feature_cache = detail_ptr->feature_list;
        time_t low_start = 0;

        tmp_bitmap = bit_copy(*avail_bitmap);
        feat_iter = list_iterator_create(feature_cache);
        while ((feat_ptr = (job_feature_t *) list_next(feat_iter))) {
            detail_ptr->feature_list =
                list_create(feature_list_delete);
            feature_base = xmalloc(sizeof(job_feature_t));
            feature_base->name = xstrdup(feat_ptr->name);
            feature_base->op_code = feat_ptr->op_code;
            list_append(detail_ptr->feature_list, feature_base);
            while ((feat_ptr->paren > 0) &&
                   ((feat_ptr = (job_feature_t *)
                     list_next(feat_iter)))) {
                feature_base = xmalloc(sizeof(job_feature_t));
                feature_base->name = xstrdup(feat_ptr->name);
                feature_base->op_code = feat_ptr->op_code;
                list_append(detail_ptr->feature_list, feature_base);
            }
            feature_base->op_code = FEATURE_OP_END;

            if ((job_req_node_filter(job_ptr, *avail_bitmap, true) ==
                 SLURM_SUCCESS) &&
                (bit_set_count(*avail_bitmap) >= min_nodes)) {
                preemptee_candidates =
                    slurm_find_preemptable_jobs(job_ptr);
                rc = select_g_job_test(job_ptr, *avail_bitmap,
                                       min_nodes, max_nodes,
                                       req_nodes,
                                       SELECT_MODE_WILL_RUN,
                                       preemptee_candidates, NULL,
                                       exc_core_bitmap);
                FREE_NULL_LIST(preemptee_candidates);
                if ((rc == SLURM_SUCCESS) &&
                    ((low_start == 0) ||
                     (low_start > job_ptr->start_time))) {
                    low_start = job_ptr->start_time;
                    low_bitmap = *avail_bitmap;
                    *avail_bitmap = NULL;
                }
            }
            FREE_NULL_BITMAP(*avail_bitmap);
            *avail_bitmap = bit_copy(tmp_bitmap);
            list_destroy(detail_ptr->feature_list);
        }
        list_iterator_destroy(feat_iter);
        FREE_NULL_BITMAP(tmp_bitmap);

        if (low_start) {
            job_ptr->start_time = low_start;
            rc = SLURM_SUCCESS;
            FREE_NULL_BITMAP(*avail_bitmap);
            *avail_bitmap = low_bitmap;
        } else {
            rc = ESLURM_NODES_BUSY;
            FREE_NULL_BITMAP(low_bitmap);
        }

        /* Restore the original feature information */
        detail_ptr->feature_list = feature_cache;
    } else if (detail_ptr->feature_list) {
        if ((job_req_node_filter(job_ptr, *avail_bitmap, true) !=
             SLURM_SUCCESS) ||
            (bit_set_count(*avail_bitmap) < min_nodes)) {
            rc = ESLURM_NODES_BUSY;
        } else {
            preemptee_candidates =
                slurm_find_preemptable_jobs(job_ptr);
            rc = select_g_job_test(job_ptr, *avail_bitmap,
                                   min_nodes, max_nodes, req_nodes,
                                   SELECT_MODE_WILL_RUN,
                                   preemptee_candidates, NULL,
                                   exc_core_bitmap);
        }
    } else {
        /*
         * Try to schedule the job. First on dedicated nodes
         * then on shared nodes (if so configured).
         */
        uint16_t orig_shared;
        time_t now = time(NULL);
        char str[100];

        preemptee_candidates = slurm_find_preemptable_jobs(job_ptr);
        orig_shared = job_ptr->details->share_res;
        job_ptr->details->share_res = 0;
        tmp_bitmap = bit_copy(*avail_bitmap);

        if (exc_core_bitmap) {
            bit_fmt(str, (sizeof(str) - 1), exc_core_bitmap);
            debug2("%s exclude core bitmap: %s", __func__, str);
        }

        rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
                               max_nodes, req_nodes,
                               SELECT_MODE_WILL_RUN,
                               preemptee_candidates, NULL,
                               exc_core_bitmap);
        job_ptr->details->share_res = orig_shared;

        if (((rc != SLURM_SUCCESS) || (job_ptr->start_time > now)) &&
            (orig_shared != 0)) {
            FREE_NULL_BITMAP(*avail_bitmap);
            *avail_bitmap = tmp_bitmap;
            rc = select_g_job_test(job_ptr, *avail_bitmap, min_nodes,
                                   max_nodes, req_nodes,
                                   SELECT_MODE_WILL_RUN,
                                   preemptee_candidates, NULL,
                                   exc_core_bitmap);
        } else
            FREE_NULL_BITMAP(tmp_bitmap);
    }

    FREE_NULL_LIST(preemptee_candidates);
    return rc;
}
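/*
 * Example (added for clarity, not from the original source): for a
 * constraint such as "rack1*2&rack2*2" (counted features joined by XAND),
 * the XAND branch above tests each feature alternative separately and ORs
 * the usable nodes into low_bitmap, keeping the latest ("high") start time
 * since every XANDed feature must be satisfied. For "a|b" (XOR), each
 * alternative is tested on a copy of the available nodes and the
 * alternative yielding the earliest ("low") start time wins.
 */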
/* Terminate backfill_agent */
extern void stop_backfill_agent(void)
{
    slurm_mutex_lock(&term_lock);
    stop_backfill = true;
    slurm_cond_signal(&term_cond);
    slurm_mutex_unlock(&term_lock);
}

/* Sleep for at least specified time, returns actual sleep time in usec */
static uint32_t _my_sleep(int64_t usec)
{
    int64_t nsec;
    uint32_t sleep_time = 0;
    struct timespec ts = {0, 0};
    struct timeval tv1 = {0, 0}, tv2 = {0, 0};

    if (gettimeofday(&tv1, NULL)) {     /* Some error */
        sleep(1);
        return 1000000;
    }

    nsec  = tv1.tv_usec + usec;
    nsec *= 1000;
    ts.tv_sec  = tv1.tv_sec + (nsec / 1000000000);
    ts.tv_nsec = nsec % 1000000000;
    slurm_mutex_lock(&term_lock);
    if (!stop_backfill)
        slurm_cond_timedwait(&term_cond, &term_lock, &ts);
    slurm_mutex_unlock(&term_lock);

    if (gettimeofday(&tv2, NULL))
        return usec;
    sleep_time  = (tv2.tv_sec - tv1.tv_sec) * 1000000;
    sleep_time += tv2.tv_usec;
    sleep_time -= tv1.tv_usec;

    return sleep_time;
}
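/*
 * NOTE (added for clarity): _load_config() below extracts each option with
 * the same idiom: find "name=" in SchedulerParameters via xstrcasestr(),
 * then parse the value at a fixed offset equal to strlen("name="). For
 * example, for "bf_interval=" the value starts at tmp_ptr + 12 because
 * strlen("bf_interval=") == 12. Out-of-range values are logged with
 * error() and replaced by the compiled-in default.
 */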
static void _load_config(void)
{
    char *sched_params, *tmp_ptr, *tmp_str = NULL;

    sched_params = slurm_get_sched_params();
    debug_flags  = slurm_get_debug_flags();

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_interval="))) {
        backfill_interval = atoi(tmp_ptr + 12);
        if (backfill_interval < 1 ||
            backfill_interval > MAX_BACKFILL_INTERVAL) {
            error("Invalid SchedulerParameters bf_interval: %d",
                  backfill_interval);
            backfill_interval = BACKFILL_INTERVAL;
        }
    } else {
        backfill_interval = BACKFILL_INTERVAL;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_time="))) {
        bf_max_time = atoi(tmp_ptr + 12);
        if (bf_max_time < 1 || bf_max_time > MAX_BF_MAX_TIME) {
            error("Invalid SchedulerParameters bf_max_time: %d",
                  bf_max_time);
            bf_max_time = backfill_interval;
        }
    } else {
        bf_max_time = backfill_interval;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_window="))) {
        backfill_window = atoi(tmp_ptr + 10) * 60;  /* mins to secs */
        if (backfill_window < 1 ||
            backfill_window > MAX_BACKFILL_WINDOW) {
            error("Invalid SchedulerParameters bf_window: %d",
                  backfill_window);
            backfill_window = BACKFILL_WINDOW;
        }
    } else {
        backfill_window = BACKFILL_WINDOW;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_test=")))
        max_backfill_job_cnt = atoi(tmp_ptr + 16);
    else if ((tmp_ptr = xstrcasestr(sched_params, "max_job_bf="))) {
        fatal("Invalid parameter max_job_bf. The option is no longer supported, please use bf_max_job_test instead.");
    } else
        max_backfill_job_cnt = 100;
    if (max_backfill_job_cnt < 1 ||
        max_backfill_job_cnt > MAX_BF_MAX_JOB_TEST) {
        error("Invalid SchedulerParameters bf_max_job_test: %d",
              max_backfill_job_cnt);
        max_backfill_job_cnt = 100;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_resolution="))) {
        backfill_resolution = atoi(tmp_ptr + 14);
        if (backfill_resolution < 1 ||
            backfill_resolution > MAX_BACKFILL_RESOLUTION) {
            error("Invalid SchedulerParameters bf_resolution: %d",
                  backfill_resolution);
            backfill_resolution = BACKFILL_RESOLUTION;
        }
    } else {
        backfill_resolution = BACKFILL_RESOLUTION;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_array_resv="))) {
        bf_max_job_array_resv = atoi(tmp_ptr + 22);
        if (bf_max_job_array_resv < 0 ||
            bf_max_job_array_resv > MAX_BF_MAX_JOB_ARRAY_RESV) {
            error("Invalid SchedulerParameters bf_max_job_array_resv: %d",
                  bf_max_job_array_resv);
            bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
        }
    } else {
        bf_max_job_array_resv = BF_MAX_JOB_ARRAY_RESV;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_part="))) {
        max_backfill_job_per_part = atoi(tmp_ptr + 16);
        if (max_backfill_job_per_part < 0) {
            error("Invalid SchedulerParameters bf_max_job_part: %d",
                  max_backfill_job_per_part);
            max_backfill_job_per_part = 0;
        }
    } else {
        max_backfill_job_per_part = 0;
    }
    if ((max_backfill_job_per_part != 0) &&
        (max_backfill_job_per_part >= max_backfill_job_cnt)) {
        error("bf_max_job_part >= bf_max_job_test (%u >= %u)",
              max_backfill_job_per_part, max_backfill_job_cnt);
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_start="))) {
        max_backfill_jobs_start = atoi(tmp_ptr + 17);
        if (max_backfill_jobs_start < 0 ||
            max_backfill_jobs_start > MAX_BF_MAX_JOB_START) {
            error("Invalid SchedulerParameters bf_max_job_start: %d",
                  max_backfill_jobs_start);
            max_backfill_jobs_start = 0;
        }
    } else {
        max_backfill_jobs_start = 0;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user="))) {
        max_backfill_job_per_user = atoi(tmp_ptr + 16);
        if (max_backfill_job_per_user < 0) {
            error("Invalid SchedulerParameters bf_max_job_user: %d",
                  max_backfill_job_per_user);
            max_backfill_job_per_user = 0;
        }
    } else {
        max_backfill_job_per_user = 0;
    }
    if ((max_backfill_job_per_user != 0) &&
        (max_backfill_job_per_user > max_backfill_job_cnt)) {
        info("warning: bf_max_job_user > bf_max_job_test (%u > %u)",
             max_backfill_job_per_user, max_backfill_job_cnt);
    }

    bf_job_part_count_reserve = 0;
    if ((tmp_ptr = xstrcasestr(sched_params,
                               "bf_job_part_count_reserve="))) {
        int job_cnt = atoi(tmp_ptr + 26);
        if (job_cnt < 0 || job_cnt > MAX_BF_JOB_PART_COUNT_RESERVE) {
            error("Invalid SchedulerParameters bf_job_part_count_reserve: %d",
                  job_cnt);
        } else {
            bf_job_part_count_reserve = job_cnt;
        }
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_user_part="))) {
        max_backfill_job_per_user_part = atoi(tmp_ptr + 21);
        if (max_backfill_job_per_user_part < 0) {
            error("Invalid SchedulerParameters bf_max_job_user_part: %d",
                  max_backfill_job_per_user_part);
            max_backfill_job_per_user_part = 0;
        }
    } else {
        max_backfill_job_per_user_part = 0;
    }
    if ((max_backfill_job_per_user_part != 0) &&
        (max_backfill_job_per_user_part > max_backfill_job_cnt)) {
        info("warning: bf_max_job_user_part > bf_max_job_test (%u > %u)",
             max_backfill_job_per_user_part, max_backfill_job_cnt);
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_max_job_assoc="))) {
        max_backfill_job_per_assoc = atoi(tmp_ptr + 17);
        if (max_backfill_job_per_assoc < 0) {
            error("Invalid SchedulerParameters bf_max_job_assoc: %d",
                  max_backfill_job_per_assoc);
            max_backfill_job_per_assoc = 0;
        }
    } else {
        max_backfill_job_per_assoc = 0;
    }
    if ((max_backfill_job_per_assoc != 0) &&
        (max_backfill_job_per_assoc > max_backfill_job_cnt)) {
        info("warning: bf_max_job_assoc > bf_max_job_test (%u > %u)",
             max_backfill_job_per_assoc, max_backfill_job_cnt);
    }
    if ((max_backfill_job_per_assoc != 0) &&
        (max_backfill_job_per_user != 0)) {
        error("Both bf_max_job_user and bf_max_job_assoc are set: bf_max_job_assoc taking precedence.");
        max_backfill_job_per_user = 0;
    }

    bf_min_age_reserve = 0;
    if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_age_reserve="))) {
        int min_age = atoi(tmp_ptr + 19);
        if (min_age < 0 || min_age > MAX_BF_MIN_AGE_RESERVE) {
            error("Invalid SchedulerParameters bf_min_age_reserve: %d",
                  min_age);
        } else {
            bf_min_age_reserve = min_age;
        }
    }

    bf_min_prio_reserve = 0;
    if ((tmp_ptr = xstrcasestr(sched_params, "bf_min_prio_reserve="))) {
        unsigned long long int min_prio;
        tmp_ptr += 20;
        min_prio = strtoull(tmp_ptr, NULL, 10);
        if (!min_prio || min_prio > MAX_BF_MIN_PRIO_RESERVE) {
            error("Invalid SchedulerParameters bf_min_prio_reserve: %llu",
                  min_prio);
        } else {
            bf_min_prio_reserve = (uint32_t) min_prio;
        }
    }

    /* bf_continue makes backfill continue where it was if interrupted */
    if (xstrcasestr(sched_params, "bf_continue")) {
        backfill_continue = true;
    } else {
        backfill_continue = false;
    }

    if (xstrcasestr(sched_params, "assoc_limit_stop")) {
        assoc_limit_stop = true;
    } else {
        assoc_limit_stop = false;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_interval="))) {
        yield_interval = atoi(tmp_ptr + 18);
        if ((yield_interval <= 0) ||
            (yield_interval > MAX_BF_YIELD_INTERVAL)) {
            error("Invalid backfill scheduler bf_yield_interval: %d",
                  yield_interval);
            yield_interval = YIELD_INTERVAL;
        }
    } else {
        yield_interval = YIELD_INTERVAL;
    }

    if ((tmp_ptr = xstrcasestr(sched_params, "bf_yield_sleep="))) {
        yield_sleep = (int64_t) atoll(tmp_ptr + 15);
        if (yield_sleep <= 0 || yield_sleep > MAX_YIELD_SLEEP) {
            error("Invalid backfill scheduler bf_yield_sleep: %d",
                  yield_sleep);
            yield_sleep = YIELD_SLEEP;
        }
    } else {
        yield_sleep = YIELD_SLEEP;
    }

    bf_hetjob_prio = 0;
    tmp_str = xstrdup(sched_params);
    if ((tmp_ptr = xstrcasestr(tmp_str, "bf_hetjob_prio="))) {
        tmp_ptr = strtok(tmp_ptr + 15, ",");
        if (!xstrcasecmp(tmp_ptr, "min"))
            bf_hetjob_prio |= HETJOB_PRIO_MIN;
        else if (!xstrcasecmp(tmp_ptr, "max"))
            bf_hetjob_prio |= HETJOB_PRIO_MAX;
        else if (!xstrcasecmp(tmp_ptr, "avg"))
            bf_hetjob_prio |= HETJOB_PRIO_AVG;
        else
            error("Invalid SchedulerParameters bf_hetjob_prio: %s",
                  tmp_ptr);
    }
    xfree(tmp_str);

    bf_hetjob_immediate = false;
    if (xstrcasestr(sched_params, "bf_hetjob_immediate"))
        bf_hetjob_immediate = true;
    if (bf_hetjob_immediate && !bf_hetjob_prio) {
        bf_hetjob_prio |= HETJOB_PRIO_MIN;
        info("bf_hetjob_immediate automatically sets bf_hetjob_prio=min");
    }

    if (xstrcasestr(sched_params, "bf_one_resv_per_job"))
        bf_one_resv_per_job = true;
    else
        bf_one_resv_per_job = false;

    if (xstrcasestr(sched_params, "bf_running_job_reserve"))
        bf_running_job_reserve = true;
    else
        bf_running_job_reserve = false;

    if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_cnt=")))
        max_rpc_cnt = atoi(tmp_ptr + 12);
    else if ((tmp_ptr = xstrcasestr(sched_params, "max_rpc_count=")))
        max_rpc_cnt = atoi(tmp_ptr + 14);
    else
        max_rpc_cnt = 0;
    if ((max_rpc_cnt < 0) || (max_rpc_cnt > MAX_MAX_RPC_CNT)) {
        error("Invalid SchedulerParameters max_rpc_cnt: %d", max_rpc_cnt);
        max_rpc_cnt = 0;
    }

    xfree(sched_params);
}

/* Note that slurm.conf has changed */
extern void backfill_reconfig(void)
{
    slurm_mutex_lock(&config_lock);
    config_flag = true;
    slurm_mutex_unlock(&config_lock);
}
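/*
 * NOTE (added for clarity): bf_cycle_last below is wall-clock time for the
 * cycle minus bf_sleep_usec, the time spent sleeping in _yield_locks()
 * while locks were released, so the statistics reflect time actually spent
 * scheduling. For example, a 5-second cycle that yielded locks for 2
 * seconds is recorded as a 3-second (3000000 usec) cycle.
 */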
/*
 * Update backfill scheduling statistics
 * IN tv1 - start time
 * IN tv2 - end (current) time
 * IN node_space_recs - count of records in resources/time table being tested
 */
static void _do_diag_stats(struct timeval *tv1, struct timeval *tv2,
                           int node_space_recs)
{
    uint32_t delta_t, real_time;

    delta_t  = (tv2->tv_sec - tv1->tv_sec) * 1000000;
    delta_t += tv2->tv_usec;
    delta_t -= tv1->tv_usec;
    real_time = delta_t - bf_sleep_usec;

    slurmctld_diag_stats.bf_cycle_counter++;
    slurmctld_diag_stats.bf_cycle_sum += real_time;
    slurmctld_diag_stats.bf_cycle_last = real_time;

    slurmctld_diag_stats.bf_depth_sum +=
        slurmctld_diag_stats.bf_last_depth;
    slurmctld_diag_stats.bf_depth_try_sum +=
        slurmctld_diag_stats.bf_last_depth_try;
    if (slurmctld_diag_stats.bf_cycle_last >
        slurmctld_diag_stats.bf_cycle_max) {
        slurmctld_diag_stats.bf_cycle_max =
            slurmctld_diag_stats.bf_cycle_last;
    }

    slurmctld_diag_stats.bf_table_size = node_space_recs;
    slurmctld_diag_stats.bf_table_size_sum += node_space_recs;
}

/* backfill_agent - detached thread periodically attempts to backfill jobs */
extern void *backfill_agent(void *args)
{
    time_t now;
    double wait_time;
    static time_t last_backfill_time = 0;
    /* Read config and partitions; Write jobs and nodes */
    slurmctld_lock_t all_locks = {
        READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
    bool load_config;
    bool short_sleep = false;
    int backfill_cnt = 0;

#if HAVE_SYS_PRCTL_H
    if (prctl(PR_SET_NAME, "bckfl", NULL, NULL, NULL) < 0) {
        error("%s: cannot set my name to %s %m", __func__, "backfill");
    }
#endif
    _load_config();
    last_backfill_time = time(NULL);
    het_job_list = list_create(_het_job_map_del);
    while (!stop_backfill) {
        if (short_sleep)
            _my_sleep(USEC_IN_SEC);
        else
            _my_sleep((int64_t) backfill_interval * USEC_IN_SEC);
        if (stop_backfill)
            break;
        if (slurmctld_config.scheduling_disabled)
            continue;

        list_flush(het_job_list);
        slurm_mutex_lock(&config_lock);
        if (config_flag) {
            config_flag = false;
            load_config = true;
        } else {
            load_config = false;
        }
        slurm_mutex_unlock(&config_lock);
        if (load_config)
            _load_config();
        now = time(NULL);
        wait_time = difftime(now, last_backfill_time);
        if ((wait_time < backfill_interval) ||
            job_is_completing(NULL) || _many_pending_rpcs() ||
            !avail_front_end(NULL) || !_more_work(last_backfill_time)) {
            short_sleep = true;
            continue;
        }

        slurm_mutex_lock(&check_bf_running_lock);
        slurmctld_diag_stats.bf_active = 1;
        slurm_mutex_unlock(&check_bf_running_lock);

        lock_slurmctld(all_locks);
        if ((backfill_cnt++ % 2) == 0)
            _het_job_start_clear();
        (void) _attempt_backfill();
        last_backfill_time = time(NULL);
        (void) bb_g_job_try_stage_in();
        unlock_slurmctld(all_locks);

        slurm_mutex_lock(&check_bf_running_lock);
        slurmctld_diag_stats.bf_active = 0;
        slurm_mutex_unlock(&check_bf_running_lock);

        short_sleep = false;
    }
    FREE_NULL_LIST(het_job_list);
    xhash_free(user_usage_map); /* May have been init'ed if used */
    return NULL;
}

/*
 * Clear the start_time and sched_nodes for all pending jobs. This is used
 * to ensure that a job which can run in multiple partitions has its
 * start_time and sched_nodes set to the partition offering the earliest
 * start_time.
 */
static int _clear_job_estimates(void *x, void *arg)
{
    job_record_t *job_ptr = (job_record_t *) x;

    if (IS_JOB_PENDING(job_ptr)) {
        job_ptr->start_time = 0;
        xfree(job_ptr->sched_nodes);
    }
    return SLURM_SUCCESS;
}
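/*
 * NOTE (added for clarity): _yield_locks() below implements the
 * bf_yield_interval/bf_yield_sleep throttle. Every yield_interval usec of
 * testing (or whenever pending RPCs exceed max_rpc_cnt), the main loop in
 * _attempt_backfill() drops the slurmctld locks, sleeps, and re-acquires
 * them so that user commands and other RPCs are not starved during long
 * backfill cycles.
 */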
/*
 * Return non-zero to break the backfill loop if change in job, node or
 * partition state or the backfill scheduler needs to be stopped.
 */
static int _yield_locks(int64_t usec)
{
    slurmctld_lock_t all_locks = {
        READ_LOCK, WRITE_LOCK, WRITE_LOCK, READ_LOCK, READ_LOCK };
    time_t job_update, node_update, part_update;
    bool load_config = false;
    int yield_rpc_cnt;

    yield_rpc_cnt = MAX((max_rpc_cnt / 10), 20);
    job_update  = last_job_update;
    node_update = last_node_update;
    part_update = last_part_update;

    unlock_slurmctld(all_locks);
    while (!stop_backfill) {
        bf_sleep_usec += _my_sleep(usec);
        slurm_mutex_lock(&slurmctld_config.thread_count_lock);
        if ((max_rpc_cnt == 0) ||
            (slurmctld_config.server_thread_count <= yield_rpc_cnt)) {
            slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
            break;
        }
        verbose("backfill: continuing to yield locks, %d RPCs pending",
                slurmctld_config.server_thread_count);
        slurm_mutex_unlock(&slurmctld_config.thread_count_lock);
    }
    lock_slurmctld(all_locks);
    slurm_mutex_lock(&config_lock);
    if (config_flag)
        load_config = true;
    slurm_mutex_unlock(&config_lock);

    if ((last_job_update == job_update) &&
        (last_node_update == node_update) &&
        (last_part_update == part_update) &&
        (!stop_backfill) && (!load_config))
        return 0;
    else
        return 1;
}

/*
 * Test if this job still has access to the specified partition. The job's
 * available partitions may have changed when locks were released.
 */
static bool _job_part_valid(job_record_t *job_ptr, part_record_t *part_ptr)
{
    part_record_t *avail_part_ptr;
    ListIterator part_iterator;
    bool rc = false;

    if (job_ptr->part_ptr_list) {
        part_iterator = list_iterator_create(job_ptr->part_ptr_list);
        while ((avail_part_ptr = list_next(part_iterator))) {
            if (avail_part_ptr == part_ptr) {
                rc = true;
                break;
            }
        }
        list_iterator_destroy(part_iterator);
    } else if (job_ptr->part_ptr == part_ptr) {
        rc = true;
    }

    return rc;
}

/*
 * Determine if job in the backfill queue is still runnable.
 * Job state could change when locks are periodically released.
 */
static bool _job_runnable_now(job_record_t *job_ptr)
{
    uint16_t cleaning = 0;

    if (IS_JOB_REVOKED(job_ptr))
        return false;
    if (!IS_JOB_PENDING(job_ptr))   /* Started in other partition */
        return false;
    if (job_ptr->priority == 0)     /* Job has been held */
        return false;
    if (IS_JOB_COMPLETING(job_ptr)) /* Started, requeue and completing */
        return false;

    /*
     * Already reserved resources for either bf_max_job_array_resv or
     * max_run_tasks number of jobs in the array. If max_run_tasks is 0,
     * it wasn't set, so ignore it.
     */
    if (job_ptr->array_recs &&
        ((job_ptr->array_recs->pend_run_tasks >= bf_max_job_array_resv) ||
         (job_ptr->array_recs->max_run_tasks &&
          (job_ptr->array_recs->pend_run_tasks >=
           job_ptr->array_recs->max_run_tasks))))
        return false;

    select_g_select_jobinfo_get(job_ptr->select_jobinfo,
                                SELECT_JOBDATA_CLEANING, &cleaning);
    if (cleaning)                   /* Started, requeue and completing */
        return false;

    return true;
}

static void _restore_preempt_state(job_record_t *job_ptr,
                                   time_t *tmp_preempt_start_time,
                                   bool *tmp_preempt_in_progress)
{
    if ((*tmp_preempt_start_time != 0) &&
        (job_ptr->details->preempt_start_time == 0)) {
        job_ptr->details->preempt_start_time = *tmp_preempt_start_time;
        job_ptr->preempt_in_progress = *tmp_preempt_in_progress;
        *tmp_preempt_start_time = 0;
        *tmp_preempt_in_progress = false;
    }
}

/*
 * IN/OUT: prio to be adjusted
 * IN: value from current component partition
 */
static void _adjust_hetjob_prio(uint32_t *prio, uint32_t val)
{
    if (!*prio)
        *prio = val;
    else if (bf_hetjob_prio & HETJOB_PRIO_MIN)
        *prio = MIN(*prio, val);
    else if (bf_hetjob_prio & HETJOB_PRIO_MAX)
        *prio = MAX(*prio, val);
    else if (bf_hetjob_prio & HETJOB_PRIO_AVG)
        *prio += val;
}

/*
 * IN: job_record pointer of a hetjob leader (caller responsible)
 * RET: [min|max|avg] Priority of all components from same hetjob
 */
static uint32_t _hetjob_calc_prio(job_record_t *het_leader)
{
    job_record_t *het_comp = NULL;
    uint32_t prio = 0, tmp = 0, cnt = 0, i = 0, nparts = 0;
    ListIterator iter = NULL;

    if (bf_hetjob_prio & HETJOB_PRIO_MIN)
        prio = INFINITE;

    iter = list_iterator_create(het_leader->het_job_list);
    while ((het_comp = list_next(iter))) {
        if (het_comp->part_ptr_list && het_comp->priority_array &&
            (nparts = list_count(het_comp->part_ptr_list))) {
            for (i = 0; i < nparts; i++) {
                tmp = het_comp->priority_array[i];
                if (tmp == 0) { /* job held */
                    prio = 0;
                    break;
                }
                _adjust_hetjob_prio(&prio, tmp);
                cnt++;
            }
            if (prio == 0)      /* job held */
                break;
        } else {
            tmp = het_comp->priority;
            if (tmp == 0) {     /* job held */
                prio = 0;
                break;
            }
            _adjust_hetjob_prio(&prio, tmp);
            cnt++;
        }
        if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio == 1))
            break;              /* Can not get lower */
    }
    list_iterator_destroy(iter);

    if (prio && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG))
        prio /= cnt;

    return prio;
}
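/*
 * Example (added for clarity): with bf_hetjob_prio=min and hetjob
 * components holding priorities 900, 500 and 700, the whole hetjob is
 * scheduled at priority 500; with "max" it is 900, and with "avg" the
 * running sum 2100 is divided by the component count to give 700. A held
 * component (priority 0) forces the whole hetjob to priority 0.
 */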
/*
 * IN: job_record pointer of a hetjob leader (caller responsible)
 * RET: [min|max|avg] PriorityTier of all components from same hetjob
 */
static uint32_t _hetjob_calc_prio_tier(job_record_t *het_leader)
{
    job_record_t *het_comp = NULL;
    part_record_t *part_ptr = NULL;
    uint32_t prio_tier = 0, tmp = 0, cnt = 0;
    ListIterator iter = NULL, iter2 = NULL;

    if (bf_hetjob_prio & HETJOB_PRIO_MIN)
        prio_tier = NO_VAL16 - 1;

    iter = list_iterator_create(het_leader->het_job_list);
    while ((het_comp = list_next(iter))) {
        if (het_comp->part_ptr_list &&
            list_count(het_comp->part_ptr_list)) {
            iter2 = list_iterator_create(het_comp->part_ptr_list);
            while ((part_ptr = list_next(iter2))) {
                tmp = part_ptr->priority_tier;
                _adjust_hetjob_prio(&prio_tier, tmp);
                cnt++;
            }
            list_iterator_destroy(iter2);
        } else {
            tmp = het_comp->part_ptr->priority_tier;
            _adjust_hetjob_prio(&prio_tier, tmp);
            cnt++;
        }
        if ((bf_hetjob_prio & HETJOB_PRIO_MIN) && (prio_tier == 0))
            break;      /* Minimum found. */
        if ((bf_hetjob_prio & HETJOB_PRIO_MAX) &&
            (prio_tier == (NO_VAL16 - 1)))
            break;      /* Maximum found. */
    }
    list_iterator_destroy(iter);

    if (prio_tier && cnt && (bf_hetjob_prio & HETJOB_PRIO_AVG))
        prio_tier /= cnt;

    return prio_tier;
}

/*
 * IN: job_record pointer of a hetjob leader (caller responsible)
 * RET: true if any component from same hetjob has a reservation
 */
static bool _hetjob_any_resv(job_record_t *het_leader)
{
    job_record_t *het_comp = NULL;
    ListIterator iter = NULL;
    bool any_resv = false;

    iter = list_iterator_create(het_leader->het_job_list);
    while (!any_resv && (het_comp = list_next(iter))) {
        if (het_comp->resv_id != 0)
            any_resv = true;
    }
    list_iterator_destroy(iter);

    return any_resv;
}

static int _foreach_het_job_details(void *x, void *arg)
{
    job_record_t *job_ptr = (job_record_t *) x;
    job_ptr->het_details = (het_job_details_t *) arg;
    return SLURM_SUCCESS;
}

static int _bf_reserve_running(void *x, void *arg)
{
    job_record_t *job_ptr = (job_record_t *) x;
    node_space_handler_t *ns_h = (node_space_handler_t *) arg;
    node_space_map_t *node_space = ns_h->node_space;
    int *ns_recs_ptr = ns_h->node_space_recs;
    time_t start_time = job_ptr->start_time;
    time_t end_time = job_ptr->end_time;

    if (!job_ptr || !IS_JOB_RUNNING(job_ptr))
        return SLURM_SUCCESS;
    if (!job_ptr->job_resrcs ||
        !(job_ptr->job_resrcs->whole_node == WHOLE_NODE_REQUIRED))
        return SLURM_SUCCESS;
    if (slurm_job_preempt_mode(job_ptr) != PREEMPT_MODE_OFF)
        return SLURM_SUCCESS;

    bitstr_t *tmp_bitmap = bit_copy(job_ptr->node_bitmap);
    bit_not(tmp_bitmap);
    end_time = (end_time / backfill_resolution) * backfill_resolution;
    _add_reservation(start_time, end_time, tmp_bitmap, node_space,
                     ns_recs_ptr);
    FREE_NULL_BITMAP(tmp_bitmap);

    return SLURM_SUCCESS;
}

static int _set_hetjob_details(void *x, void *arg)
{
    job_record_t *job_ptr = (job_record_t *) x;
    het_job_details_t *details = NULL;

    if (IS_JOB_PENDING(job_ptr) && job_ptr->het_job_id &&
        !job_ptr->het_job_offset && job_ptr->het_job_list) {
        /*
         * Pending hetjob leader component. Do calculations only once
         * for whole hetjob. xmalloc memory for 1 het_details struct,
         * but make the pointer accessible in all hetjob components.
         */
        if (!job_ptr->het_details)
            job_ptr->het_details =
                xmalloc(sizeof(het_job_details_t));

        details = job_ptr->het_details;
        details->any_resv = _hetjob_any_resv(job_ptr);
        details->priority_tier = _hetjob_calc_prio_tier(job_ptr);
        details->priority = _hetjob_calc_prio(job_ptr);

        list_for_each(job_ptr->het_job_list, _foreach_het_job_details,
                      details);
    }
    return SLURM_SUCCESS;
}

/* Fetch key from xhash_t item. Called from function ptr */
static void _bf_map_key_id(void *item, const char **key, uint32_t *key_len)
{
    bf_user_usage_t *user = (bf_user_usage_t *) item;

    xassert(user);

    *key = (char *) &user->uid;
    *key_len = sizeof(uid_t);
}

/* Free item from xhash_t. Called from function ptr */
static void _bf_map_free(void *item)
{
    bf_user_usage_t *user = (bf_user_usage_t *) item;

    if (!user)
        return;

    slurmdb_destroy_bf_usage_members(&user->bf_usage);
    xfree(user);
}

/* Allocate new user and add to xhash_t map */
static bf_user_usage_t *_bf_map_add_user(xhash_t *map, uid_t uid)
{
    bf_user_usage_t *user = xmalloc(sizeof(bf_user_usage_t));
    user->uid = uid;
    xhash_add(map, user);
    return user;
}

/* Find user usage from uid. Add new empty entry to map if not found */
static slurmdb_bf_usage_t *_bf_map_find_add(xhash_t *map, uid_t uid)
{
    bf_user_usage_t *user;

    xassert(map != NULL);

    if (!(user = xhash_get(map, (char *) &uid, sizeof(uid_t))))
        user = _bf_map_add_user(map, uid);
    return &user->bf_usage;
}
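/*
 * Usage sketch (added for clarity, hypothetical values): to charge one
 * backfill test against a user with no association record,
 * _job_exceeds_max_bf_param() effectively does:
 *
 *     slurmdb_bf_usage_t *u = _bf_map_find_add(user_usage_map, uid);
 *     if (_check_bf_usage(u, max_backfill_job_per_user, sched_start))
 *         return true;    // per-user bf_max_job_user limit reached
 *     u->count++;         // otherwise count this test
 *
 * _check_bf_usage() also resets stale counters left over from earlier
 * scheduling cycles.
 */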
/*
 * Check if limit exceeded. Reset usage if usage time is before current
 * scheduling iteration time
 */
static bool _check_bf_usage(slurmdb_bf_usage_t *usage, int limit,
                            time_t sched_time)
{
    if (usage->last_sched < sched_time) {
        usage->last_sched = sched_time;
        usage->count = 0;
        return false;
    }
    return usage->count >= limit;
}

/*
 * Check if job exceeds configured count limits
 * returns true if count exceeded
 */
static bool _job_exceeds_max_bf_param(job_record_t *job_ptr,
                                      time_t sched_start)
{
    slurmdb_bf_usage_t *part_usage = NULL, *user_usage = NULL,
        *assoc_usage = NULL, *user_part_usage = NULL;

    slurmdb_assoc_rec_t *assoc_ptr = job_ptr->assoc_ptr;
    part_record_t *part_ptr = job_ptr->part_ptr;

    if (max_backfill_job_per_user_part) {
        xassert(part_ptr->bf_data);
        user_part_usage = _bf_map_find_add(
            part_ptr->bf_data->user_usage, job_ptr->user_id);
        if (_check_bf_usage(user_part_usage,
                            max_backfill_job_per_user_part,
                            sched_start)) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: have already checked %u jobs for user %u on partition %s; skipping job %u, %pJ",
                     max_backfill_job_per_user_part,
                     job_ptr->user_id, job_ptr->part_ptr->name,
                     job_ptr->job_id, job_ptr);
            return true;
        }
    }

    if (max_backfill_job_per_part) {
        xassert(part_ptr->bf_data);
        part_usage = part_ptr->bf_data->job_usage;
        if (_check_bf_usage(part_usage, max_backfill_job_per_part,
                            sched_start)) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: have already checked %u jobs for partition %s; skipping %pJ",
                     max_backfill_job_per_part,
                     job_ptr->part_ptr->name, job_ptr);
            return true;
        }
    }

    if (max_backfill_job_per_assoc) {
        if (assoc_ptr) {
            if (!assoc_ptr->bf_usage)
                assoc_ptr->bf_usage =
                    xmalloc(sizeof(slurmdb_bf_usage_t));
            assoc_usage = assoc_ptr->bf_usage;
            if (_check_bf_usage(assoc_usage,
                                max_backfill_job_per_assoc,
                                sched_start)) {
                if (debug_flags & DEBUG_FLAG_BACKFILL)
                    info("backfill: have already checked %u jobs for user %u, assoc %u; skipping %pJ",
                         max_backfill_job_per_assoc,
                         job_ptr->user_id, job_ptr->assoc_id,
                         job_ptr);
                return true;
            }
        } else {
            /* Null assoc_ptr indicates no database */
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: no assoc for job %u, required for parameter bf_max_job_per_assoc",
                     job_ptr->job_id);
            assoc_usage = NULL;
        }
    }

    if (max_backfill_job_per_user) {
        if (assoc_ptr && assoc_ptr->user_rec) {
            if (!assoc_ptr->user_rec->bf_usage)
                assoc_ptr->user_rec->bf_usage =
                    xmalloc(sizeof(slurmdb_bf_usage_t));
            user_usage = assoc_ptr->user_rec->bf_usage;
        } else {
            /* No database, or user rec missing from assoc */
            if (!user_usage_map)
                user_usage_map = xhash_init(_bf_map_key_id,
                                            _bf_map_free);
            user_usage = _bf_map_find_add(user_usage_map,
                                          job_ptr->user_id);
        }
        if (_check_bf_usage(user_usage, max_backfill_job_per_user,
                            sched_start)) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: have already checked %u jobs for user %u; skipping %pJ",
                     max_backfill_job_per_user, job_ptr->user_id,
                     job_ptr);
            return true;
        }
    }

    /* Increment our user/partition limit counters as needed */
    if (user_part_usage)
        user_part_usage->count++;
    if (part_usage)
        part_usage->count++;
    if (user_usage)
        user_usage->count++;
    if (assoc_usage)
        assoc_usage->count++;

    return false;
}
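/*
 * NOTE (added for clarity): the counters above are only incremented after
 * every configured limit check has passed, so a job rejected by one limit
 * does not consume quota under the others during the same backfill cycle.
 */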
static int _attempt_backfill(void)
{
    DEF_TIMERS;
    List job_queue;
    job_queue_rec_t *job_queue_rec;
    int bb, i, j, node_space_recs, mcs_select = 0;
    slurmdb_qos_rec_t *qos_ptr = NULL;
    job_record_t *job_ptr = NULL;
    part_record_t *part_ptr;
    uint32_t end_time, end_reserve, deadline_time_limit, boot_time;
    uint32_t orig_end_time;
    uint32_t time_limit, comp_time_limit, orig_time_limit;
    uint32_t part_time_limit;
    uint32_t min_nodes, max_nodes, req_nodes;
    bitstr_t *active_bitmap = NULL, *avail_bitmap = NULL;
    bitstr_t *exc_core_bitmap = NULL, *resv_bitmap = NULL;
    time_t now, sched_start, later_start, start_res, resv_end, window_end;
    time_t het_job_time, orig_sched_start, orig_start_time = (time_t) 0;
    node_space_map_t *node_space;
    struct timeval bf_time1, bf_time2;
    int rc = 0, error_code;
    int job_test_count = 0, test_time_count = 0, pend_time;
    bool already_counted, many_rpcs = false;
    job_record_t *reject_array_job = NULL;
    part_record_t *reject_array_part = NULL;
    uint32_t start_time;
    time_t config_update = slurmctld_conf.last_update;
    time_t part_update = last_part_update;
    struct timeval start_tv;
    uint32_t test_array_job_id = 0;
    uint32_t test_array_count = 0;
    uint32_t job_no_reserve;
    bool is_job_array_head, resv_overlap = false;
    uint8_t save_share_res = 0, save_whole_node = 0;
    int test_fini;
    uint32_t qos_flags = 0;
    time_t qos_blocked_until = 0, qos_part_blocked_until = 0;
    time_t tmp_preempt_start_time = 0;
    bool tmp_preempt_in_progress = false;
    bitstr_t *tmp_bitmap = NULL;
    /* QOS Read lock */
    assoc_mgr_lock_t qos_read_lock = {
        NO_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
        NO_LOCK, NO_LOCK, NO_LOCK };

    bf_sleep_usec = 0;
    job_start_cnt = 0;

    if (!fed_mgr_sibs_synced()) {
        info("backfill: %s returning, federation siblings not synced yet",
             __func__);
        return SLURM_SUCCESS;
    }

    (void) bb_g_load_state(false);

    START_TIMER;
    if (debug_flags & DEBUG_FLAG_BACKFILL)
        info("backfill: beginning");
    else
        debug("backfill: beginning");
    sched_start = orig_sched_start = now = time(NULL);
    gettimeofday(&start_tv, NULL);

    job_queue = build_job_queue(true, true);
    job_test_count = list_count(job_queue);
    if (job_test_count == 0) {
        if (debug_flags & DEBUG_FLAG_BACKFILL)
            info("backfill: no jobs to backfill");
        else
            debug("backfill: no jobs to backfill");
        FREE_NULL_LIST(job_queue);
        return 0;
    } else
        debug("backfill: %u jobs to backfill", job_test_count);

    list_for_each(job_list, _clear_job_estimates, NULL);

    if (bf_hetjob_prio)
        list_for_each(job_list, _set_hetjob_details, NULL);

    gettimeofday(&bf_time1, NULL);

    slurmctld_diag_stats.bf_queue_len = job_test_count;
    slurmctld_diag_stats.bf_queue_len_sum +=
        slurmctld_diag_stats.bf_queue_len;
    job_test_count = 0;
    slurmctld_diag_stats.bf_last_depth = 0;
    slurmctld_diag_stats.bf_last_depth_try = 0;
    slurmctld_diag_stats.bf_when_last_cycle = now;

    node_space = xmalloc(sizeof(node_space_map_t) *
                         (max_backfill_job_cnt * 2 + 1));
    node_space[0].begin_time = sched_start;
    window_end = sched_start + backfill_window;
    node_space[0].end_time = window_end;

    node_space[0].avail_bitmap = bit_copy(avail_node_bitmap);
    /* Make "resuming" nodes available to be scheduled in backfill */
    bit_or(node_space[0].avail_bitmap, rs_node_bitmap);

    node_space[0].next = 0;
    node_space_recs = 1;

    if (bf_running_job_reserve) {
        node_space_handler_t node_space_handler;
        node_space_handler.node_space = node_space;
        node_space_handler.node_space_recs = &node_space_recs;

        list_for_each(job_list, _bf_reserve_running,
                      &node_space_handler);
    }

    if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
        _dump_node_space_table(node_space);

    if (assoc_limit_stop) {
        assoc_mgr_lock(&qos_read_lock);
        list_for_each(assoc_mgr_qos_list,
                      _clear_qos_blocked_times, NULL);
        assoc_mgr_unlock(&qos_read_lock);
    }

    sort_job_queue(job_queue);

    /* Ignore nodes that have been set as available during this cycle. */
    bit_clear_all(bf_ignore_node_bitmap);

    while (1) {
        uint32_t bf_array_task_id, bf_job_priority, prio_reserve;
        bool get_boot_time = false;

        /* Run some final guaranteed logic after each job iteration */
        if (job_ptr) {
            job_resv_clear_promiscous_flag(job_ptr);
            fill_array_reasons(job_ptr, reject_array_job);
        }
        job_queue_rec = (job_queue_rec_t *) list_pop(job_queue);
        if (!job_queue_rec) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: reached end of job queue");
            break;
        }

        job_ptr          = job_queue_rec->job_ptr;
        part_ptr         = job_queue_rec->part_ptr;
        bf_job_priority  = job_queue_rec->priority;
        bf_array_task_id = job_queue_rec->array_task_id;
        job_queue_rec_prom_resv(job_queue_rec);
        xfree(job_queue_rec);

        if (slurmctld_config.shutdown_time ||
            (difftime(time(NULL), orig_sched_start) >= bf_max_time)) {
            break;
        }

        many_rpcs = false;
        slurm_mutex_lock(&slurmctld_config.thread_count_lock);
        if ((max_rpc_cnt > 0) &&
            (slurmctld_config.server_thread_count >= max_rpc_cnt))
            many_rpcs = true;
        slurm_mutex_unlock(&slurmctld_config.thread_count_lock);

        if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) {
            if (debug_flags & DEBUG_FLAG_BACKFILL) {
                END_TIMER;
                info("backfill: yielding locks after testing %u(%d) jobs, %s",
                     slurmctld_diag_stats.bf_last_depth,
                     job_test_count, TIME_STR);
            }
            if ((_yield_locks(yield_sleep) && !backfill_continue) ||
                (slurmctld_conf.last_update != config_update) ||
                (last_part_update != part_update)) {
                if (debug_flags & DEBUG_FLAG_BACKFILL) {
                    info("backfill: system state changed, breaking out after testing %u(%d) jobs",
                         slurmctld_diag_stats.bf_last_depth,
                         job_test_count);
                }
                rc = 1;
                break;
            }
            if (stop_backfill)
                break;

            /* Reset backfill scheduling timers, resume testing */
            sched_start = time(NULL);
            gettimeofday(&start_tv, NULL);
            job_test_count = 0;
            test_time_count = 0;
            START_TIMER;
        }

        if ((job_ptr->array_task_id != bf_array_task_id) &&
            (bf_array_task_id == NO_VAL)) {
            /*
             * Job array element started in other partition,
             * reset pointer to "master" job array record
             */
            job_ptr = find_job_record(job_ptr->array_job_id);
            if (!job_ptr)   /* All task array elements started */
                continue;
        }

        /* Restore preemption state if needed. */
        _restore_preempt_state(job_ptr, &tmp_preempt_start_time,
                               &tmp_preempt_in_progress);

        /*
         * Establish baseline (worst case) start time for hetjob
         * Update time once start time estimate established
         */
        _het_job_start_set(job_ptr, (now + YEAR_SECONDS), NO_VAL);
        if (job_ptr->het_job_id &&
            (job_ptr->state_reason == WAIT_NO_REASON)) {
            xfree(job_ptr->state_desc);
            job_ptr->state_reason = WAIT_RESOURCES;
        }

        if (!_job_runnable_now(job_ptr))
            continue;
        if (!part_ptr)
            continue;

        job_ptr->last_sched_eval = now;
        job_ptr->part_ptr = part_ptr;
        job_ptr->priority = bf_job_priority;
        mcs_select = slurm_mcs_get_select(job_ptr);

        het_job_time = _het_job_start_find(job_ptr, now);
        if (het_job_time > (now + backfill_window))
            continue;

        if (job_ptr->state_reason == FAIL_ACCOUNT) {
            slurmdb_assoc_rec_t assoc_rec;
            memset(&assoc_rec, 0, sizeof(slurmdb_assoc_rec_t));
            assoc_rec.acct      = job_ptr->account;
            assoc_rec.partition = job_ptr->part_ptr->name;
            assoc_rec.uid       = job_ptr->user_id;

            if (!assoc_mgr_fill_in_assoc(acct_db_conn, &assoc_rec,
                                         accounting_enforce,
                                         &job_ptr->assoc_ptr, false)) {
                job_ptr->state_reason = WAIT_NO_REASON;
                xfree(job_ptr->state_desc);
                job_ptr->assoc_id = assoc_rec.id;
                last_job_update = now;
            } else {
                debug("backfill: %pJ has invalid association",
                      job_ptr);
                xfree(job_ptr->state_desc);
                job_ptr->state_reason = WAIT_ASSOC_RESOURCE_LIMIT;
                continue;
            }
        }

        if (job_ptr->qos_id) {
            assoc_mgr_lock_t locks = {
                READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK,
                NO_LOCK, NO_LOCK, NO_LOCK };

            assoc_mgr_lock(&locks);
            if (job_ptr->assoc_ptr &&
                (accounting_enforce & ACCOUNTING_ENFORCE_QOS) &&
                ((job_ptr->qos_id >= g_qos_count) ||
                 !job_ptr->assoc_ptr->usage ||
                 !job_ptr->assoc_ptr->usage->valid_qos ||
                 !bit_test(job_ptr->assoc_ptr->usage->valid_qos,
                           job_ptr->qos_id)) &&
                !job_ptr->limit_set.qos) {
                debug("backfill: %pJ has invalid QOS", job_ptr);
                assoc_mgr_unlock(&locks);
                job_fail_qos(job_ptr, __func__);
                last_job_update = now;
                continue;
            } else if (job_ptr->state_reason == FAIL_QOS) {
                xfree(job_ptr->state_desc);
                job_ptr->state_reason = WAIT_NO_REASON;
                last_job_update = now;
            }
            assoc_mgr_unlock(&locks);
        }

        assoc_mgr_lock(&qos_read_lock);
        if (job_ptr->qos_ptr) {
            qos_flags = job_ptr->qos_ptr->flags;
            qos_blocked_until = job_ptr->qos_ptr->blocked_until;
        } else {
            qos_flags = 0;
            qos_blocked_until = 0;
        }

        if (job_ptr->part_ptr->qos_ptr)
            qos_part_blocked_until =
                job_ptr->part_ptr->qos_ptr->blocked_until;
        else
            qos_part_blocked_until = 0;

        if (part_policy_valid_qos(job_ptr->part_ptr, job_ptr->qos_ptr,
                                  job_ptr) != SLURM_SUCCESS) {
            assoc_mgr_unlock(&qos_read_lock);
            continue;
        }
        assoc_mgr_unlock(&qos_read_lock);

        if (!assoc_limit_stop &&
            !acct_policy_job_runnable_pre_select(job_ptr, false)) {
            continue;
        }

        if (!(prio_reserve = acct_policy_get_prio_thresh(job_ptr,
                                                         false)))
            prio_reserve = bf_min_prio_reserve;

        if (prio_reserve && (debug_flags & DEBUG_FLAG_BACKFILL))
            info("backfill: %pJ has a prio_reserve of %u",
                 job_ptr, prio_reserve);

        job_no_reserve = 0;
        if (prio_reserve && (job_ptr->priority < prio_reserve)) {
            job_no_reserve = TEST_NOW_ONLY;
        } else if (bf_min_age_reserve && job_ptr->details->begin_time) {
            pend_time = difftime(time(NULL),
                                 job_ptr->details->begin_time);
            if (pend_time < bf_min_age_reserve)
                job_no_reserve = TEST_NOW_ONLY;
        }

        if (bf_one_resv_per_job && job_ptr->start_time) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: %pJ already added a backfill reservation. Test immediate start only for partition %s",
                     job_ptr, job_ptr->part_ptr->name);
            job_no_reserve = TEST_NOW_ONLY;
        }
        /* If partition data is needed and not yet initialized, do so */
        if (!job_ptr->part_ptr->bf_data &&
            (bf_job_part_count_reserve ||
             max_backfill_job_per_user_part ||
             max_backfill_job_per_part)) {
            bf_part_data_t *part_data =
                xmalloc(sizeof(bf_part_data_t));
            part_data->job_usage  =
                xmalloc(sizeof(slurmdb_bf_usage_t));
            part_data->resv_usage =
                xmalloc(sizeof(slurmdb_bf_usage_t));
            part_data->user_usage = xhash_init(_bf_map_key_id,
                                               _bf_map_free);
            job_ptr->part_ptr->bf_data = part_data;
        }

        if ((job_no_reserve == 0) && bf_job_part_count_reserve) {
            if (_check_bf_usage(
                    job_ptr->part_ptr->bf_data->resv_usage,
                    bf_job_part_count_reserve, orig_sched_start))
                job_no_reserve = TEST_NOW_ONLY;
        }

        if (tmp_preempt_in_progress)
            continue;   /* scheduled in another partition */

        orig_start_time = job_ptr->start_time;
        orig_time_limit = job_ptr->time_limit;

        if (job_ptr->array_recs && (job_ptr->array_task_id == NO_VAL))
            is_job_array_head = true;
        else
            is_job_array_head = false;

next_task:
        /*
         * Save the current preemption state. Reset preemption state
         * in the job_ptr so a job array can preempt multiple jobs.
         */
        tmp_preempt_in_progress = job_ptr->preempt_in_progress;
        tmp_preempt_start_time = job_ptr->details->preempt_start_time;
        job_ptr->details->preempt_start_time = 0;
        job_ptr->preempt_in_progress = false;

        job_test_count++;
        slurmctld_diag_stats.bf_last_depth++;
        already_counted = false;

        if (!IS_JOB_PENDING(job_ptr) || /* Started in other partition */
            (job_ptr->priority == 0))   /* Job has been held */
            continue;
        if (!avail_front_end(job_ptr))
            continue;   /* No available frontend for this job */
        if (!_job_part_valid(job_ptr, part_ptr))
            continue;   /* Partition change during lock yield */
        if ((job_ptr->array_task_id != NO_VAL) || job_ptr->array_recs) {
            if (reject_array_job &&
                (reject_array_job->array_job_id ==
                 job_ptr->array_job_id) &&
                (reject_array_part == part_ptr))
                continue;   /* already rejected array element */

            /* assume reject whole array for now, clear if OK */
            reject_array_job = job_ptr;
            reject_array_part = part_ptr;
            if (!job_array_start_test(job_ptr))
                continue;
        }
        job_ptr->part_ptr = part_ptr;
        if (job_limits_check(&job_ptr, true) != WAIT_NO_REASON) {
            /* should never happen */
            continue;
        }

        if (debug_flags & DEBUG_FLAG_BACKFILL) {
            info("backfill test for %pJ Prio=%u Partition=%s",
                 job_ptr, job_ptr->priority,
                 job_ptr->part_ptr->name);
        }

        /* Test to see if we've exceeded any per user/partition limit */
        if (_job_exceeds_max_bf_param(job_ptr, orig_sched_start))
            continue;

        if (((part_ptr->state_up & PARTITION_SCHED) == 0) ||
            (part_ptr->node_bitmap == NULL)) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: partition %s not usable",
                     job_ptr->part_ptr->name);
            continue;
        }

        if ((!job_independent(job_ptr)) ||
            (license_job_test(job_ptr, time(NULL), true) !=
             SLURM_SUCCESS)) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: %pJ not runnable now", job_ptr);
            continue;
        }

        /* Determine minimum and maximum node counts */
        error_code = get_node_cnts(job_ptr, qos_flags, part_ptr,
                                   &min_nodes, &req_nodes, &max_nodes);

        if (error_code == ESLURM_ACCOUNTING_POLICY) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: %pJ acct policy node limit",
                     job_ptr);
            continue;
        } else if (error_code ==
                   ESLURM_REQUESTED_PART_CONFIG_UNAVAILABLE) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: %pJ node count too high", job_ptr);
            continue;
        } else if (error_code != SLURM_SUCCESS) {
            if (debug_flags & DEBUG_FLAG_BACKFILL)
                info("backfill: error setting nodes for %pJ: %s",
                     job_ptr, slurm_strerror(error_code));
            continue;
        }
	/* Test of deadline */
	now = time(NULL);
	deadline_time_limit = 0;
	if ((job_ptr->deadline) && (job_ptr->deadline != NO_VAL)) {
		if (!deadline_ok(job_ptr, "backfill"))
			continue;
		deadline_time_limit = (job_ptr->deadline - now) / 60;
	}

	/* Determine job's expected completion time */
	if (part_ptr->max_time == INFINITE)
		part_time_limit = YEAR_MINUTES;
	else
		part_time_limit = part_ptr->max_time;
	if ((job_ptr->time_limit == NO_VAL) ||
	    (job_ptr->time_limit == INFINITE)) {
		time_limit = part_time_limit;
		job_ptr->limit_set.time = 1;
	} else {
		if (part_ptr->max_time == INFINITE)
			time_limit = job_ptr->time_limit;
		else
			time_limit = MIN(job_ptr->time_limit,
					 part_time_limit);
	}
	if (deadline_time_limit)
		comp_time_limit = MIN(time_limit, deadline_time_limit);
	else
		comp_time_limit = time_limit;
	if ((qos_flags & QOS_FLAG_NO_RESERVE) && slurm_get_preempt_mode())
		time_limit = job_ptr->time_limit = 1;
	else if (job_ptr->time_min && (job_ptr->time_min < time_limit))
		time_limit = job_ptr->time_limit = job_ptr->time_min;

	later_start = now;
	if (assoc_limit_stop) {
		if (qos_blocked_until > later_start) {
			later_start = qos_blocked_until;
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("QOS blocked_until move start_res to %ld",
				     later_start);
		}
		if (qos_part_blocked_until > later_start) {
			later_start = qos_part_blocked_until;
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("Part QOS blocked_until move start_res to %ld",
				     later_start);
		}
	}

TRY_LATER:
	if (slurmctld_config.shutdown_time ||
	    (difftime(time(NULL), orig_sched_start) >= bf_max_time)) {
		_set_job_time_limit(job_ptr, orig_time_limit);
		break;
	}
	test_time_count++;

	many_rpcs = false;
	slurm_mutex_lock(&slurmctld_config.thread_count_lock);
	if ((max_rpc_cnt > 0) &&
	    (slurmctld_config.server_thread_count >= max_rpc_cnt))
		many_rpcs = true;
	slurm_mutex_unlock(&slurmctld_config.thread_count_lock);

	if (many_rpcs || (slurm_delta_tv(&start_tv) >= yield_interval)) {
		uint32_t save_time_limit = job_ptr->time_limit;
		_set_job_time_limit(job_ptr, orig_time_limit);
		if (debug_flags & DEBUG_FLAG_BACKFILL) {
			END_TIMER;
			info("backfill: yielding locks after testing "
			     "%u(%d) jobs, %u time slots, %s",
			     slurmctld_diag_stats.bf_last_depth,
			     job_test_count, test_time_count, TIME_STR);
		}
		if ((_yield_locks(yield_sleep) && !backfill_continue) ||
		    (slurmctld_conf.last_update != config_update) ||
		    (last_part_update != part_update)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL) {
				info("backfill: system state changed, "
				     "breaking out after testing "
				     "%u(%d) jobs",
				     slurmctld_diag_stats.bf_last_depth,
				     job_test_count);
			}
			rc = 1;
			break;
		}
		if (stop_backfill)
			break;

		/* Reset backfill scheduling timers, resume testing */
		sched_start = time(NULL);
		gettimeofday(&start_tv, NULL);
		job_test_count = 1;
		test_time_count = 0;
		START_TIMER;

		/*
		 * With bf_continue configured, the original job could
		 * have been scheduled. Revalidate the job record here.
		 */
		if (!_job_runnable_now(job_ptr))
			continue;
		if (!avail_front_end(job_ptr))
			continue;	/* No available frontend */
		if (!job_independent(job_ptr)) {
			/* No longer independent
			 * (e.g. 
another singleton started) */ continue; } job_ptr->time_limit = save_time_limit; job_ptr->part_ptr = part_ptr; } FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); start_res = MAX(later_start, het_job_time); resv_end = 0; later_start = 0; /* Determine impact of any advance reservations */ j = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap, &resv_overlap, false); if (j != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: %pJ reservation defer", job_ptr); _set_job_time_limit(job_ptr, orig_time_limit); continue; } if (start_res > now) end_time = (time_limit * 60) + start_res; else end_time = (time_limit * 60) + now; if (end_time < now) /* Overflow 32-bits */ end_time = INFINITE; if (resv_overlap) resv_end = find_resv_end(start_res); /* Identify usable nodes for this job */ bit_and(avail_bitmap, part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); bit_and_not(avail_bitmap, bf_ignore_node_bitmap); filter_by_node_owner(job_ptr, avail_bitmap); filter_by_node_mcs(job_ptr, mcs_select, avail_bitmap); tmp_bitmap = bit_copy(avail_bitmap); for (j = 0; ; ) { if ((node_space[j].end_time > start_res) && node_space[j].next && (later_start == 0)) { int tmp = node_space[j].next; bitstr_t *next_bitmap = bit_copy(tmp_bitmap); bitstr_t *current_bitmap = bit_copy(avail_bitmap); bit_and(next_bitmap, node_space[tmp].avail_bitmap); bit_and(current_bitmap, node_space[j].avail_bitmap); /* * Normally later_start is set at the end of the * first backfill reservation when the select * plugin predicts start time after later_start. * Then it goes to TRY_LATER and tries again on * a new set of nodes to check if the job can * start earlier. But if the next set of nodes * is a subset of the currently tested ones then * calling _try_sched (expensive function) would * be useless and would impact performance. 
			 */
			if (!bit_super_set(next_bitmap, current_bitmap))
				later_start = node_space[j].end_time;
			FREE_NULL_BITMAP(next_bitmap);
			FREE_NULL_BITMAP(current_bitmap);
		}
		if (node_space[j].end_time <= start_res)
			;
		else if (node_space[j].begin_time <= end_time) {
			bit_and(avail_bitmap, node_space[j].avail_bitmap);
		} else
			break;
		if ((j = node_space[j].next) == 0)
			break;
	}
	FREE_NULL_BITMAP(tmp_bitmap);
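	/*
	 * Illustrative sketch (never compiled, hence #if 0): node_space[] is
	 * a time-ordered list threaded through the "next" indices and
	 * terminated by a zero index (see node_space_map_t). The block below
	 * restates the traversal idiom used by the loop above, ANDing
	 * together the availability bitmaps of every record overlapping the
	 * test window; the local names are hypothetical.
	 */
#if 0
	{
		bitstr_t *win_avail = bit_copy(avail_bitmap);
		int k = 0;
		while (1) {
			if (node_space[k].end_time <= start_res)
				;	/* record ends before the window */
			else if (node_space[k].begin_time <= end_time)
				bit_and(win_avail,
					node_space[k].avail_bitmap);
			else
				break;	/* records are ordered by time */
			if ((k = node_space[k].next) == 0)
				break;	/* zero index terminates the list */
		}
		FREE_NULL_BITMAP(win_avail);
	}
#endif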
	if (resv_end && (++resv_end < window_end) &&
	    ((later_start == 0) || (resv_end < later_start))) {
		later_start = resv_end;
	}

	if (job_ptr->details->exc_node_bitmap) {
		bit_and_not(avail_bitmap,
			    job_ptr->details->exc_node_bitmap);
	}

	/* Test if insufficient nodes remain OR
	 * required nodes missing OR
	 * nodes lack features OR
	 * no change since previously tested nodes (only changes
	 * in other partition nodes) */
	if ((bit_set_count(avail_bitmap) < min_nodes) ||
	    ((job_ptr->details->req_node_bitmap) &&
	     (!bit_super_set(job_ptr->details->req_node_bitmap,
			     avail_bitmap))) ||
	    (job_req_node_filter(job_ptr, avail_bitmap, true))) {
		if (later_start && !job_no_reserve) {
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}

		/* Job cannot start until too far in the future */
		_set_job_time_limit(job_ptr, orig_time_limit);
		/*
		 * Use orig_start_time; if the job can't start in a
		 * different partition it will be 0
		 */
		job_ptr->start_time = orig_start_time;
		continue;
	}

	/* Identify nodes which are definitely off limits */
	FREE_NULL_BITMAP(resv_bitmap);
	resv_bitmap = bit_copy(avail_bitmap);
	bit_not(resv_bitmap);

	/* This is the time-consuming operation */
	debug2("backfill: entering _try_sched for %pJ.", job_ptr);

	if (!already_counted) {
		slurmctld_diag_stats.bf_last_depth_try++;
		already_counted = true;
	}

	if (debug_flags & DEBUG_FLAG_BACKFILL_MAP)
		_dump_job_test(job_ptr, avail_bitmap, start_res);
	test_fini = -1;
	build_active_feature_bitmap(job_ptr, avail_bitmap, &active_bitmap);
	job_ptr->bit_flags |= BACKFILL_TEST;
	job_ptr->bit_flags |= job_no_reserve;	/* 0 or TEST_NOW_ONLY */
	if (active_bitmap) {
		j = _try_sched(job_ptr, &active_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);
		if (j == SLURM_SUCCESS) {
			FREE_NULL_BITMAP(avail_bitmap);
			avail_bitmap = active_bitmap;
			active_bitmap = NULL;
			test_fini = 1;
		} else {
			if (node_features_g_overlap(active_bitmap))
				get_boot_time = true;
			FREE_NULL_BITMAP(active_bitmap);
			save_share_res = job_ptr->details->share_res;
			save_whole_node = job_ptr->details->whole_node;
			job_ptr->details->share_res = 0;
			job_ptr->details->whole_node = 1;
			if (!save_whole_node)
				job_ptr->bit_flags |= BF_WHOLE_NODE_TEST;
			test_fini = 0;
		}
	}
	boot_time = 0;
	if (test_fini == 0) {
		/* Unable to start job using currently active features,
		 * need to try using features which can be made
		 * available after node reboot */
		bitstr_t *tmp_core_bitmap = NULL;
		bitstr_t *tmp_node_bitmap = NULL;
		debug2("backfill: entering _try_sched for %pJ. Need to use features which can be made available after node reboot",
		       job_ptr);

		/* Determine impact of any advance reservations */
		resv_end = 0;
		j = job_test_resv(job_ptr, &start_res, false,
				  &tmp_node_bitmap, &tmp_core_bitmap,
				  &resv_overlap, true);
		if (resv_overlap)
			resv_end = find_resv_end(start_res);
		if (resv_end && (++resv_end < window_end) &&
		    ((later_start == 0) || (resv_end < later_start))) {
			later_start = resv_end;
		}
		if (j == SLURM_SUCCESS) {
			FREE_NULL_BITMAP(exc_core_bitmap);
			exc_core_bitmap = tmp_core_bitmap;
			bit_and(avail_bitmap, tmp_node_bitmap);
			FREE_NULL_BITMAP(tmp_node_bitmap);
		}
		if (get_boot_time)
			boot_time = node_features_g_boot_time();
		orig_end_time = end_time;
		end_time += boot_time;

		for (j = 0; ; ) {
			if (node_space[j].end_time <= start_res)
				;
			else if (node_space[j].begin_time <= end_time) {
				if (node_space[j].begin_time > orig_end_time)
					bit_and(avail_bitmap,
						node_space[j].avail_bitmap);
			} else
				break;
			if ((j = node_space[j].next) == 0)
				break;
		}
	}
	if (test_fini != 1) {
		/* Either active_bitmap was NULL or not usable by the
		 * job. Test using avail_bitmap instead */
		j = _try_sched(job_ptr, &avail_bitmap, min_nodes, max_nodes,
			       req_nodes, exc_core_bitmap);
		if (test_fini == 0) {
			job_ptr->details->share_res = save_share_res;
			job_ptr->details->whole_node = save_whole_node;
		}
	}
	job_ptr->bit_flags &= ~BACKFILL_TEST;
	job_ptr->bit_flags &= ~BF_WHOLE_NODE_TEST;
	job_ptr->bit_flags &= ~TEST_NOW_ONLY;

	now = time(NULL);
	if (j != SLURM_SUCCESS) {
		_set_job_time_limit(job_ptr, orig_time_limit);
		if (later_start && !job_no_reserve) {
			job_ptr->start_time = 0;
			goto TRY_LATER;
		}
		job_ptr->start_time = orig_start_time;
		continue;	/* not runnable in this partition */
	}

	if (start_res > job_ptr->start_time) {
		job_ptr->start_time = start_res;
		last_job_update = now;
	}

	/*
	 * avail_bitmap at this point contains a bitmap of nodes
	 * selected for this job to be allocated
	 */
	if ((job_ptr->start_time <= now) &&
	    (bit_overlap_any(avail_bitmap, cg_node_bitmap) ||
	     bit_overlap_any(avail_bitmap, rs_node_bitmap))) {
		/* Need to wait for in-progress completion/epilog */
		job_ptr->start_time = now + 1;
		later_start = 0;
	}

	if ((job_ptr->start_time <= now) &&
	    ((bb = bb_g_job_test_stage_in(job_ptr, true)) != 1)) {
		if (job_ptr->state_reason != WAIT_NO_REASON) {
			;
		} else if (bb == -1) {
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_BURST_BUFFER_RESOURCE;
			job_ptr->start_time =
				bb_g_job_get_est_start(job_ptr);
		} else {	/* bb == 0 */
			xfree(job_ptr->state_desc);
			job_ptr->state_reason = WAIT_BURST_BUFFER_STAGING;
			job_ptr->start_time = now + 1;
		}
Priority=%u.", job_ptr, job_state_string(job_ptr->job_state), job_reason_string(job_ptr->state_reason), job_ptr->priority); last_job_update = now; _set_job_time_limit(job_ptr, orig_time_limit); later_start = 0; if (bb == -1) continue; } else if ((job_ptr->het_job_id == 0) && (job_ptr->start_time <= now)) { /* Can start now */ uint32_t save_time_limit = job_ptr->time_limit; uint32_t hard_limit; bool reset_time = false; int rc; /* get fed job lock from origin cluster */ if (fed_mgr_job_lock(job_ptr)) { if (debug_flags & DEBUG_FLAG_BACKFILL) info("backfill: %pJ can't get fed job lock from origin cluster to backfill job", job_ptr); rc = ESLURM_FED_JOB_LOCK; goto skip_start; } rc = _start_job(job_ptr, resv_bitmap); if (rc == SLURM_SUCCESS) { /* * If the following fails because of network * connectivity, the origin cluster should ask * when it comes back up if the cluster_lock * cluster actually started the job */ fed_mgr_job_start(job_ptr, job_ptr->start_time); } else { fed_mgr_job_unlock(job_ptr); } skip_start: if (qos_flags & QOS_FLAG_NO_RESERVE) { if (orig_time_limit == NO_VAL) { acct_policy_alter_job( job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; job_ptr->limit_set.time = 1; } else { acct_policy_alter_job( job_ptr, orig_time_limit); _set_job_time_limit(job_ptr, orig_time_limit); } } else if ((rc == SLURM_SUCCESS) && job_ptr->time_min) { /* Set time limit as high as possible */ acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; reset_time = true; } else if (orig_time_limit == NO_VAL) { acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; job_ptr->limit_set.time = 1; } else if (deadline_time_limit && (rc == SLURM_SUCCESS)) { acct_policy_alter_job(job_ptr, comp_time_limit); job_ptr->time_limit = comp_time_limit; reset_time = true; } else { acct_policy_alter_job(job_ptr, orig_time_limit); _set_job_time_limit(job_ptr, orig_time_limit); } /* * Only set end_time if start_time is set, * or else end_time will be small (ie. 1969). */ if (IS_JOB_FINISHED(job_ptr)) { /* Zero size or killed on startup */ } else if (job_ptr->start_time) { if (job_ptr->time_limit == INFINITE) hard_limit = YEAR_SECONDS; else hard_limit = job_ptr->time_limit * 60; job_ptr->end_time = job_ptr->start_time + hard_limit; /* * Only set if start_time. end_time must be set * beforehand for _reset_job_time_limit. */ if (reset_time) { _reset_job_time_limit(job_ptr, now, node_space); time_limit = job_ptr->time_limit; } } else if (rc == SLURM_SUCCESS) { error("%s: start_time of 0 on successful " "backfill. This shouldn't happen. :)", __func__); } if ((rc == ESLURM_RESERVATION_BUSY) || (rc == ESLURM_ACCOUNTING_POLICY && !assoc_limit_stop) || (rc == ESLURM_POWER_NOT_AVAIL) || (rc == ESLURM_POWER_RESERVED)) { /* Unknown future start time, just skip job */ job_ptr->start_time = orig_start_time; _set_job_time_limit(job_ptr, orig_time_limit); continue; } else if (rc == ESLURM_ACCOUNTING_POLICY) { /* Unknown future start time. Determining * when it can start with certainty requires * when every running and pending job starts * and ends and tracking all of there resources. * That requires very high overhead, that we * don't want to add. Estimate that it can start * after the next job ends (or in 5 minutes if * we don't have that information yet). 
*/ if (later_start) job_ptr->start_time = later_start; else job_ptr->start_time = now + 500; if (job_ptr->qos_blocking_ptr && job_state_qos_grp_limit( job_ptr->state_reason)) { assoc_mgr_lock(&qos_read_lock); qos_ptr = job_ptr->qos_blocking_ptr; if (qos_ptr->blocked_until < job_ptr->start_time) { qos_ptr->blocked_until = job_ptr->start_time; } assoc_mgr_unlock(&qos_read_lock); } } else if (rc != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: planned start of %pJ failed: %s", job_ptr, slurm_strerror(rc)); } /* Drop through and reserve these resources. * Likely due to state changes during sleep. * Make best-effort based upon original state */ _set_job_time_limit(job_ptr, orig_time_limit); later_start = 0; } else { /* Started this job, move to next one */ /* Clear assumed rejected array status */ reject_array_job = NULL; reject_array_part = NULL; /* Update the database if job time limit * changed and move to next job */ if (save_time_limit != job_ptr->time_limit) jobacct_storage_job_start_direct( acct_db_conn, job_ptr); job_start_cnt++; if (max_backfill_jobs_start && (job_start_cnt >= max_backfill_jobs_start)){ if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: bf_max_job_start" " limit of %d reached", max_backfill_jobs_start); } break; } if (is_job_array_head && (job_ptr->array_task_id != NO_VAL)) { /* Try starting next task of job array */ job_ptr = find_job_record(job_ptr-> array_job_id); if (job_ptr && IS_JOB_PENDING(job_ptr) && (bb_g_job_test_stage_in( job_ptr, false) == 1)) goto next_task; } continue; } } else if (job_ptr->het_job_id != 0) { uint32_t max_time_limit; max_time_limit =_get_job_max_tl(job_ptr, now, node_space); comp_time_limit = MIN(comp_time_limit, max_time_limit); job_ptr->node_cnt_wag = MAX(bit_set_count(avail_bitmap), 1); _het_job_start_set(job_ptr, job_ptr->start_time, comp_time_limit); _set_job_time_limit(job_ptr, orig_time_limit); if (bf_hetjob_immediate && (!max_backfill_jobs_start || (job_start_cnt < max_backfill_jobs_start))) _het_job_start_test(node_space, job_ptr->het_job_id); } if ((job_ptr->start_time > now) && (job_no_reserve != 0)) { if ((orig_start_time != 0) && (orig_start_time < job_ptr->start_time)) { /* Can start earlier in different partition */ job_ptr->start_time = orig_start_time; } _set_job_time_limit(job_ptr, orig_time_limit); continue; } if (later_start && (job_ptr->start_time > later_start)) { /* Try later when some nodes currently reserved for * pending jobs are free */ if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: Try later %pJ later_start %ld", job_ptr, later_start); } job_ptr->start_time = 0; goto TRY_LATER; } start_time = job_ptr->start_time; end_reserve = job_ptr->start_time + boot_time + (time_limit * 60); start_time = (start_time / backfill_resolution) * backfill_resolution; end_reserve = (end_reserve / backfill_resolution) * backfill_resolution; if (job_ptr->start_time > (sched_start + backfill_window)) { /* Starts too far in the future to worry about */ if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); if ((orig_start_time != 0) && (orig_start_time < job_ptr->start_time)) { /* Can start earlier in different partition */ job_ptr->start_time = orig_start_time; } _set_job_time_limit(job_ptr, orig_time_limit); continue; } if (node_space_recs >= max_backfill_job_cnt) { if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: table size limit of %u reached", max_backfill_job_cnt); } if ((max_backfill_job_per_part != 0) && (max_backfill_job_per_part >= 
max_backfill_job_cnt)) { error("bf_max_job_part >= bf_max_job_test (%u >= %u)", max_backfill_job_per_part, max_backfill_job_cnt); } else if ((max_backfill_job_per_user != 0) && (max_backfill_job_per_user > max_backfill_job_cnt)) { info("warning: bf_max_job_user > bf_max_job_test (%u > %u)", max_backfill_job_per_user, max_backfill_job_cnt); } else if ((max_backfill_job_per_assoc != 0) && (max_backfill_job_per_assoc > max_backfill_job_cnt)) { info("warning: bf_max_job_assoc > bf_max_job_test (%u > %u)", max_backfill_job_per_assoc, max_backfill_job_cnt); } _set_job_time_limit(job_ptr, orig_time_limit); break; } if ((job_ptr->start_time > now) && (job_ptr->state_reason != WAIT_BURST_BUFFER_RESOURCE) && (job_ptr->state_reason != WAIT_BURST_BUFFER_STAGING) && _test_resv_overlap(node_space, avail_bitmap, start_time, end_reserve)) { /* This job overlaps with an existing reservation for * job to be backfill scheduled, which the sched * plugin does not know about. Try again later. */ later_start = job_ptr->start_time; job_ptr->start_time = 0; if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: %pJ overlaps with existing reservation start_time=%u end_reserve=%u boot_time=%u later_start %ld", job_ptr, start_time, end_reserve, boot_time, later_start); } goto TRY_LATER; } if (_het_job_deadlock_test(job_ptr)) { _set_job_time_limit(job_ptr, orig_time_limit); continue; } /* * Add reservation to scheduling table if appropriate */ if (!assoc_limit_stop) { uint32_t selected_node_cnt; uint64_t tres_req_cnt[slurmctld_tres_cnt]; assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, READ_LOCK, NO_LOCK, NO_LOCK }; selected_node_cnt = bit_set_count(avail_bitmap); memcpy(tres_req_cnt, job_ptr->tres_req_cnt, sizeof(tres_req_cnt)); tres_req_cnt[TRES_ARRAY_CPU] = (uint64_t)(job_ptr->total_cpus ? job_ptr->total_cpus : job_ptr->details->min_cpus); tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( job_ptr->job_resrcs, job_ptr->details->pn_min_memory, tres_req_cnt[TRES_ARRAY_CPU], selected_node_cnt); tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt; assoc_mgr_lock(&locks); gres_set_job_tres_cnt(job_ptr->gres_list, selected_node_cnt, tres_req_cnt, true); tres_req_cnt[TRES_ARRAY_BILLING] = assoc_mgr_tres_weighted( tres_req_cnt, job_ptr->part_ptr->billing_weights, slurmctld_conf.priority_flags, true); if (!acct_policy_job_runnable_post_select(job_ptr, tres_req_cnt, true)) { assoc_mgr_unlock(&locks); if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: adding reservation for %pJ blocked by acct_policy_job_runnable_post_select", job_ptr); } _set_job_time_limit(job_ptr, orig_time_limit); continue; } assoc_mgr_unlock(&locks); } if (debug_flags & DEBUG_FLAG_BACKFILL) _dump_job_sched(job_ptr, end_reserve, avail_bitmap); if (qos_flags & QOS_FLAG_NO_RESERVE) { _set_job_time_limit(job_ptr, orig_time_limit); continue; } if (bf_job_part_count_reserve) { if (_check_bf_usage( job_ptr->part_ptr->bf_data->resv_usage, bf_job_part_count_reserve, orig_sched_start)) { _set_job_time_limit(job_ptr, orig_time_limit); continue; } job_ptr->part_ptr->bf_data->resv_usage->count++; } /* Clear assumed rejected array status */ reject_array_job = NULL; reject_array_part = NULL; if ((orig_start_time == 0) || (job_ptr->start_time < orig_start_time)) { /* Can't start earlier in different partition. 
*/ xfree(job_ptr->sched_nodes); job_ptr->sched_nodes = bitmap2node_name(avail_bitmap); } bit_not(avail_bitmap); if ((!bf_one_resv_per_job || !orig_start_time) && !(job_ptr->bit_flags & JOB_PROM)) { _add_reservation(start_time, end_reserve, avail_bitmap, node_space, &node_space_recs); } if (debug_flags & DEBUG_FLAG_BACKFILL_MAP) _dump_node_space_table(node_space); if ((orig_start_time != 0) && (orig_start_time < job_ptr->start_time)) { /* Can start earlier in different partition */ job_ptr->start_time = orig_start_time; } _set_job_time_limit(job_ptr, orig_time_limit); if (job_ptr->array_recs) { /* Try making reservation for next task of job array */ if (test_array_job_id != job_ptr->array_job_id) { test_array_job_id = job_ptr->array_job_id; test_array_count = 1; } else { test_array_count++; } /* * Don't consider the next task if it would exceed the * maximum number of runnable tasks. If max_run_tasks is * 0, then it wasn't set, so ignore it. */ if ((test_array_count < bf_max_job_array_resv) && (test_array_count < job_ptr->array_recs->task_cnt) && (!job_ptr->array_recs->max_run_tasks || (job_ptr->array_recs->pend_run_tasks < job_ptr->array_recs->max_run_tasks))) goto next_task; } } /* Restore preemption state if needed. */ _restore_preempt_state(job_ptr, &tmp_preempt_start_time, &tmp_preempt_in_progress); _het_job_deadlock_fini(); if (!bf_hetjob_immediate && (!max_backfill_jobs_start || (job_start_cnt < max_backfill_jobs_start))) _het_job_start_test(node_space, 0); FREE_NULL_BITMAP(avail_bitmap); FREE_NULL_BITMAP(exc_core_bitmap); FREE_NULL_BITMAP(resv_bitmap); for (i = 0; ; ) { FREE_NULL_BITMAP(node_space[i].avail_bitmap); if ((i = node_space[i].next) == 0) break; } xfree(node_space); FREE_NULL_LIST(job_queue); gettimeofday(&bf_time2, NULL); _do_diag_stats(&bf_time1, &bf_time2, node_space_recs); if (debug_flags & DEBUG_FLAG_BACKFILL) { END_TIMER; info("backfill: completed testing %u(%d) jobs, %s", slurmctld_diag_stats.bf_last_depth, job_test_count, TIME_STR); } slurm_mutex_lock(&slurmctld_config.thread_count_lock); if (slurmctld_config.server_thread_count >= 150) { info("backfill: %d pending RPCs at cycle end, consider " "configuring max_rpc_cnt", slurmctld_config.server_thread_count); } slurm_mutex_unlock(&slurmctld_config.thread_count_lock); return rc; } /* Try to start the job on any non-reserved nodes */ static int _start_job(job_record_t *job_ptr, bitstr_t *resv_bitmap) { int rc; bitstr_t *orig_exc_nodes = NULL; bool is_job_array_head = false; static uint32_t fail_jobid = 0; if (job_ptr->details->exc_node_bitmap) { orig_exc_nodes = bit_copy(job_ptr->details->exc_node_bitmap); bit_or(job_ptr->details->exc_node_bitmap, resv_bitmap); } else job_ptr->details->exc_node_bitmap = bit_copy(resv_bitmap); if (job_ptr->array_recs) is_job_array_head = true; rc = select_nodes(job_ptr, false, NULL, NULL, false, SLURMDB_JOB_FLAG_BACKFILL); if (is_job_array_head && job_ptr->details) { job_record_t *base_job_ptr; base_job_ptr = find_job_record(job_ptr->array_job_id); if (base_job_ptr && base_job_ptr != job_ptr && base_job_ptr->array_recs) { FREE_NULL_BITMAP( base_job_ptr->details->exc_node_bitmap); if (orig_exc_nodes) base_job_ptr->details->exc_node_bitmap = bit_copy(orig_exc_nodes); } } if (job_ptr->details) { /* select_nodes() might reset exc_node_bitmap */ FREE_NULL_BITMAP(job_ptr->details->exc_node_bitmap); job_ptr->details->exc_node_bitmap = orig_exc_nodes; } else FREE_NULL_BITMAP(orig_exc_nodes); if (rc == SLURM_SUCCESS) { /* job initiated */ last_job_update = time(NULL); info("backfill: Started 
%pJ in %s on %s", job_ptr, job_ptr->part_ptr->name, job_ptr->nodes); power_g_job_start(job_ptr); if (job_ptr->batch_flag == 0) srun_allocate(job_ptr); else if (!IS_JOB_CONFIGURING(job_ptr)) launch_job(job_ptr); slurmctld_diag_stats.backfilled_jobs++; slurmctld_diag_stats.last_backfilled_jobs++; if (job_ptr->het_job_id) slurmctld_diag_stats.backfilled_het_jobs++; if (debug_flags & DEBUG_FLAG_BACKFILL) { info("backfill: Jobs backfilled since boot: %u", slurmctld_diag_stats.backfilled_jobs); } } else if ((job_ptr->job_id != fail_jobid) && (rc != ESLURM_ACCOUNTING_POLICY)) { char *node_list; bit_not(resv_bitmap); node_list = bitmap2node_name(resv_bitmap); /* This happens when a job has sharing disabled and * a selected node is still completing some job, * which should be a temporary situation. */ verbose("backfill: Failed to start %pJ with %s avail: %s", job_ptr, node_list, slurm_strerror(rc)); xfree(node_list); fail_jobid = job_ptr->job_id; } else { debug3("backfill: Failed to start %pJ: %s", job_ptr, slurm_strerror(rc)); } return rc; } /* * Compute a job's maximum time based upon conflicts in resources * planned for use by other jobs and that job's min/max time limit * Return NO_VAL if no restriction */ static uint32_t _get_job_max_tl(job_record_t *job_ptr, time_t now, node_space_map_t *node_space) { int32_t j; time_t comp_time = 0; uint32_t max_tl = NO_VAL; if (job_ptr->time_min == 0) return max_tl; for (j = 0; ; ) { if ((node_space[j].begin_time != now) && // No current conflicts (node_space[j].begin_time < job_ptr->end_time) && (!bit_super_set(job_ptr->node_bitmap, node_space[j].avail_bitmap))) { /* Job overlaps pending job's resource reservation */ if ((comp_time == 0) || (comp_time > node_space[j].begin_time)) comp_time = node_space[j].begin_time; } if ((j = node_space[j].next) == 0) break; } if (comp_time != 0) max_tl = (comp_time - now + 59) / 60; return max_tl; } /* * Reset a job's time limit (and end_time) as high as possible * within the range job_ptr->time_min and job_ptr->time_limit. 
 * Avoid using resources reserved for pending jobs or in resource
 * reservations
 */
static void _reset_job_time_limit(job_record_t *job_ptr, time_t now,
				  node_space_map_t *node_space)
{
	int32_t j, resv_delay;
	uint32_t orig_time_limit = job_ptr->time_limit;
	uint32_t new_time_limit;

	for (j = 0; ; ) {
		if ((node_space[j].begin_time != now) && // No current conflicts
		    (node_space[j].begin_time < job_ptr->end_time) &&
		    (!bit_super_set(job_ptr->node_bitmap,
				    node_space[j].avail_bitmap))) {
			/* Job overlaps pending job's resource reservation */
			resv_delay = difftime(node_space[j].begin_time, now);
			resv_delay /= 60;	/* seconds to minutes */
			if (resv_delay < job_ptr->time_limit)
				job_ptr->time_limit = resv_delay;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}
	new_time_limit = MAX(job_ptr->time_min, job_ptr->time_limit);
	acct_policy_alter_job(job_ptr, new_time_limit);
	job_ptr->time_limit = new_time_limit;
	job_ptr->end_time = job_ptr->start_time + (job_ptr->time_limit * 60);

	job_time_adj_resv(job_ptr);

	if (orig_time_limit != job_ptr->time_limit) {
		info("backfill: %pJ time limit changed from %u to %u",
		     job_ptr, orig_time_limit, job_ptr->time_limit);
	}
}

/* Report if any changes occurred to job, node or partition information */
static bool _more_work(time_t last_backfill_time)
{
	bool rc = false;

	slurm_mutex_lock(&thread_flag_mutex);
	if ((last_job_update >= last_backfill_time) ||
	    (last_node_update >= last_backfill_time) ||
	    (last_part_update >= last_backfill_time)) {
		rc = true;
	}
	slurm_mutex_unlock(&thread_flag_mutex);
	return rc;
}

/* Create a reservation for a job in the future */
static void _add_reservation(uint32_t start_time, uint32_t end_reserve,
			     bitstr_t *res_bitmap,
			     node_space_map_t *node_space,
			     int *node_space_recs)
{
	bool placed = false;
	int i, j;

#if 0
	info("add job start:%u end:%u", start_time, end_reserve);
	for (j = 0; ; ) {
		info("node start:%u end:%u",
		     (uint32_t) node_space[j].begin_time,
		     (uint32_t) node_space[j].end_time);
		if ((j = node_space[j].next) == 0)
			break;
	}
#endif

	start_time = MAX(start_time, node_space[0].begin_time);
	for (j = 0; ; ) {
		if (node_space[j].end_time > start_time) {
			/* insert start entry record */
			i = *node_space_recs;
			node_space[i].begin_time = start_time;
			node_space[i].end_time = node_space[j].end_time;
			node_space[j].end_time = start_time;
			node_space[i].avail_bitmap =
				bit_copy(node_space[j].avail_bitmap);
			node_space[i].next = node_space[j].next;
			node_space[j].next = i;
			(*node_space_recs)++;
			placed = true;
		}
		if (node_space[j].end_time == start_time) {
			/* no need to insert new start entry record */
			placed = true;
		}
		if (placed == true) {
			while ((j = node_space[j].next)) {
				if (end_reserve < node_space[j].end_time) {
					/* insert end entry record */
					i = *node_space_recs;
					node_space[i].begin_time = end_reserve;
					node_space[i].end_time =
						node_space[j].end_time;
					node_space[j].end_time = end_reserve;
					node_space[i].avail_bitmap =
						bit_copy(node_space[j].
							 avail_bitmap);
					node_space[i].next = node_space[j].next;
					node_space[j].next = i;
					(*node_space_recs)++;
					break;
				}
				if (end_reserve == node_space[j].end_time) {
					break;
				}
			}
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}

	for (j = 0; ; ) {
		if ((node_space[j].begin_time >= start_time) &&
		    (node_space[j].end_time <= end_reserve))
			bit_and(node_space[j].avail_bitmap, res_bitmap);
		if ((node_space[j].begin_time >= end_reserve) ||
		    ((j = node_space[j].next) == 0))
			break;
	}
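	/*
	 * Illustrative worked example of the record splitting performed
	 * above (never compiled, hence #if 0; all node counts and times are
	 * hypothetical). Reserving two of four nodes from t=100 to t=200
	 * inside a single record spanning t=0 to t=300 leaves three records:
	 * [0,100) all nodes, [100,200) minus the reserved pair, and
	 * [200,300) all nodes again.
	 */
#if 0
	{
		node_space_map_t ns[3];
		int recs = 1;
		bitstr_t *all_nodes = bit_alloc(4);
		bitstr_t *resv_map;

		bit_nset(all_nodes, 0, 3);	/* 4-node cluster, all free */
		ns[0].begin_time = 0;
		ns[0].end_time = 300;
		ns[0].avail_bitmap = all_nodes;
		ns[0].next = 0;

		resv_map = bit_copy(all_nodes);
		bit_nclear(resv_map, 1, 2);	/* job consumes nodes 1-2 */
		_add_reservation(100, 200, resv_map, ns, &recs);
		/* recs == 3: ns[0]=[0,100), ns[1]=[100,200), ns[2]=[200,300);
		 * only ns[1].avail_bitmap has lost nodes 1-2 */
	}
#endif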
	/* Drop records with identical bitmaps (up to one record).
	 * This can significantly improve performance of the backfill
	 * tests. */
	for (i = 0; ; ) {
		if ((j = node_space[i].next) == 0)
			break;
		if (!bit_equal(node_space[i].avail_bitmap,
			       node_space[j].avail_bitmap)) {
			i = j;
			continue;
		}
		node_space[i].end_time = node_space[j].end_time;
		node_space[i].next = node_space[j].next;
		FREE_NULL_BITMAP(node_space[j].avail_bitmap);
		break;
	}
}

/*
 * Determine if the resource specification for a new job overlaps with a
 * reservation that the backfill scheduler has made for a job to be
 * started in the future.
 * IN use_bitmap - nodes to be allocated
 * IN start_time - start time of job
 * IN end_reserve - end time of job
 */
static bool _test_resv_overlap(node_space_map_t *node_space,
			       bitstr_t *use_bitmap, uint32_t start_time,
			       uint32_t end_reserve)
{
	bool overlap = false;
	int j;

	for (j = 0; ; ) {
		if ((node_space[j].end_time > start_time) &&
		    (node_space[j].begin_time < end_reserve) &&
		    (!bit_super_set(use_bitmap, node_space[j].avail_bitmap))) {
			overlap = true;
			break;
		}
		if ((j = node_space[j].next) == 0)
			break;
	}
	return overlap;
}

/*
 * Delete het_job_map_t record from het_job_list
 */
static void _het_job_map_del(void *x)
{
	het_job_map_t *map = (het_job_map_t *) x;
	FREE_NULL_LIST(map->het_job_rec_list);
	xfree(map);
}

/*
 * Return 1 if a het_job_map_t record with a specific het_job_id is found.
 * Always return 1 if "key" is zero.
 */
static int _het_job_find_map(void *x, void *key)
{
	het_job_map_t *map = (het_job_map_t *) x;
	uint32_t *het_job_id = (uint32_t *) key;

	if ((het_job_id == NULL) || (map->het_job_id == *het_job_id))
		return 1;
	return 0;
}

/*
 * Return 1 if a het_job_rec_t record with a specific job_id is found.
 */
static int _het_job_find_rec(void *x, void *key)
{
	het_job_rec_t *rec = (het_job_rec_t *) x;
	uint32_t *job_id = (uint32_t *) key;

	if (rec->job_id == *job_id)
		return 1;
	return 0;
}

/*
 * Remove vestigial elements from het_job_list. For each still-active
 * element, clear the previously computed start time. This is used to
 * periodically clear history so that heterogeneous jobs do not keep
 * getting deferred based upon old system state
 */
static void _het_job_start_clear(void)
{
	het_job_map_t *map;
	ListIterator iter;

	iter = list_iterator_create(het_job_list);
	while ((map = (het_job_map_t *) list_next(iter))) {
		if (map->prev_start == 0) {
			list_delete_item(iter);
		} else {
			map->prev_start = 0;
			list_flush(map->het_job_rec_list);
		}
	}
	list_iterator_destroy(iter);
}

/*
 * For a given het_job_map_t record, determine the earliest that it can
 * start, which is the time at which its latest-starting component begins.
 * The "exclude_job_id" is used to exclude a hetjob component currently
 * being tested to start, presumably in a different partition.
 */
static time_t _het_job_start_compute(het_job_map_t *map,
				     uint32_t exclude_job_id)
{
	ListIterator iter;
	het_job_rec_t *rec;
	time_t latest_start = map->prev_start;

	iter = list_iterator_create(map->het_job_rec_list);
	while ((rec = (het_job_rec_t *) list_next(iter))) {
		if (rec->job_id == exclude_job_id)
			continue;
		latest_start = MAX(latest_start, rec->latest_start);
	}
	list_iterator_destroy(iter);

	return latest_start;
}
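/*
 * Illustrative example (never compiled, hence #if 0; IDs and times are
 * hypothetical): all components of a heterogeneous job must start together,
 * so the earliest start of the whole job is the maximum of the components'
 * expected start times. With components expected at t=100, t=250 and t=175,
 * the hetjob cannot start before t=250; excluding the second component
 * (e.g. while it is being retested in another partition) yields t=175.
 */
#if 0
{
	het_job_map_t map = { 0 };
	het_job_rec_t a = { .job_id = 1, .latest_start = 100 };
	het_job_rec_t b = { .job_id = 2, .latest_start = 250 };
	het_job_rec_t c = { .job_id = 3, .latest_start = 175 };

	map.het_job_rec_list = list_create(NULL);	/* no destructor */
	list_append(map.het_job_rec_list, &a);
	list_append(map.het_job_rec_list, &b);
	list_append(map.het_job_rec_list, &c);
	/* _het_job_start_compute(&map, 0) == 250 */
	/* _het_job_start_compute(&map, 2) == 175 */
	FREE_NULL_LIST(map.het_job_rec_list);
}
#endif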
/*
 * Return the earliest that a job can start based upon _other_ components of
 * that same heterogeneous job. Return 0 if no limitation.
 *
 * If the job's state reason is BeginTime (the way all hetjobs start) and
 * that time is passed, then clear the reason field.
 */
static time_t _het_job_start_find(job_record_t *job_ptr, time_t now)
{
	het_job_map_t *map;
	time_t latest_start = (time_t) 0;

	if (job_ptr->het_job_id) {
		map = (het_job_map_t *) list_find_first(het_job_list,
							_het_job_find_map,
							&job_ptr->het_job_id);
		if (map) {
			latest_start = _het_job_start_compute(map,
							      job_ptr->job_id);
		}
		/*
		 * All hetjobs are submitted with a begin time in the future
		 * so that all components can be submitted before any of them
		 * are scheduled, but we want to clear the BeginTime reason
		 * as soon as possible to avoid confusing users
		 */
		if (job_ptr->details->begin_time <= now) {
			if (job_ptr->state_reason == WAIT_TIME) {
				job_ptr->state_reason = WAIT_NO_REASON;
				last_job_update = now;
			}
			if (job_ptr->state_reason_prev == WAIT_TIME) {
				job_ptr->state_reason_prev = WAIT_NO_REASON;
				last_job_update = now;
			}
		}
		if (latest_start && (debug_flags & DEBUG_FLAG_HETJOB)) {
			long int delay = MAX(0, latest_start - time(NULL));
			info("%pJ in partition %s expected to start in %ld secs",
			     job_ptr, job_ptr->part_ptr->name, delay);
		}
	}

	return latest_start;
}

/*
 * Record the earliest that a hetjob component can start. If it can be
 * started in multiple partitions, we only record the earliest start time
 * for the job in any partition.
 */
static void _het_job_start_set(job_record_t *job_ptr, time_t latest_start,
			       uint32_t comp_time_limit)
{
	het_job_map_t *map;
	het_job_rec_t *rec;

	if (comp_time_limit == NO_VAL)
		comp_time_limit = job_ptr->time_limit;
	if (job_ptr->het_job_id) {
		map = (het_job_map_t *) list_find_first(het_job_list,
							_het_job_find_map,
							&job_ptr->het_job_id);
		if (map) {
			if (!map->comp_time_limit) {
				map->comp_time_limit = comp_time_limit;
			} else {
				map->comp_time_limit =
					MIN(map->comp_time_limit,
					    comp_time_limit);
			}
			rec = list_find_first(map->het_job_rec_list,
					      _het_job_find_rec,
					      &job_ptr->job_id);
			if (rec && (rec->latest_start <= latest_start)) {
				/*
				 * This job can start at an earlier time in
				 * some other partition, so ignore new info
				 */
			} else if (rec) {
				rec->latest_start = latest_start;
				rec->part_ptr = job_ptr->part_ptr;
			} else {
				rec = xmalloc(sizeof(het_job_rec_t));
				rec->job_id = job_ptr->job_id;
				rec->job_ptr = job_ptr;
				rec->latest_start = latest_start;
				rec->part_ptr = job_ptr->part_ptr;
				list_append(map->het_job_rec_list, rec);
			}
		} else {
			rec = xmalloc(sizeof(het_job_rec_t));
			rec->job_id = job_ptr->job_id;
			rec->job_ptr = job_ptr;
			rec->latest_start = latest_start;
			rec->part_ptr = job_ptr->part_ptr;

			map = xmalloc(sizeof(het_job_map_t));
			map->comp_time_limit = comp_time_limit;
			map->het_job_id = job_ptr->het_job_id;
			map->het_job_rec_list = list_create(xfree_ptr);
			list_append(map->het_job_rec_list, rec);
			list_append(het_job_list, map);
		}
		if (debug_flags & DEBUG_FLAG_HETJOB) {
			time_t latest_start = _het_job_start_compute(map, 0);
			long int delay = MAX(0, latest_start - time(NULL));
			info("%pJ in partition %s set to start in %ld secs",
			     job_ptr, job_ptr->part_ptr->name, delay);
		}
	}
}

/*
 * Return TRUE if we have expected start times for all components of a
 * hetjob and all components are valid and runnable.
 *
 * NOTE: This should never happen, but we will also start the job if all of
 * the other components are already running.
 */
static bool _het_job_full(het_job_map_t *map)
{
	job_record_t *het_job_ptr, *job_ptr;
	ListIterator iter;
	bool rc = true;

	het_job_ptr = find_job_record(map->het_job_id);
	if (!het_job_ptr || !het_job_ptr->het_job_list ||
	    (!IS_JOB_RUNNING(het_job_ptr) &&
	     !_job_runnable_now(het_job_ptr))) {
		return false;
	}

	iter = list_iterator_create(het_job_ptr->het_job_list);
	while ((job_ptr = list_next(iter))) {
		if ((job_ptr->magic != JOB_MAGIC) ||
		    (job_ptr->het_job_id != map->het_job_id)) {
			rc = false;	/* bad job pointer */
			break;
		}
		if (IS_JOB_RUNNING(job_ptr))
			continue;
		if (!list_find_first(map->het_job_rec_list,
				     _het_job_find_rec, &job_ptr->job_id) ||
		    !_job_runnable_now(job_ptr)) {
			rc = false;
			break;
		}
	}
	list_iterator_destroy(iter);

	return rc;
}

/*
 * Determine if all components of a hetjob can be started now or are
 * prevented from doing so because of association or QOS limits.
 * Return true if they can all start.
 *
 * NOTE: That a hetjob passes this test does not mean that it will be able
 * to run. For example, this test assumes resource allocation at the CPU
 * level. If each task is allocated one core, with 2 CPUs, then the CPU
 * limit test would not be accurate.
 */
static bool _het_job_limit_check(het_job_map_t *map, time_t now)
{
	job_record_t *job_ptr;
	het_job_rec_t *rec;
	ListIterator iter;
	int begun_jobs = 0, fini_jobs = 0, slurmctld_tres_size;
	bool runnable = true;
	uint32_t selected_node_cnt;
	uint64_t tres_req_cnt[slurmctld_tres_cnt];
	uint64_t **tres_alloc_save = NULL;

	tres_alloc_save = xmalloc(sizeof(uint64_t *) *
				  list_count(map->het_job_rec_list));
	slurmctld_tres_size = sizeof(uint64_t) * slurmctld_tres_cnt;
	iter = list_iterator_create(map->het_job_rec_list);
	while ((rec = (het_job_rec_t *) list_next(iter))) {
		assoc_mgr_lock_t locks = { READ_LOCK, NO_LOCK, READ_LOCK,
					   NO_LOCK, READ_LOCK, NO_LOCK,
					   NO_LOCK };
		job_ptr = rec->job_ptr;
		job_ptr->part_ptr = rec->part_ptr;
		selected_node_cnt = job_ptr->node_cnt_wag;
		memcpy(tres_req_cnt, job_ptr->tres_req_cnt,
		       slurmctld_tres_size);
		tres_req_cnt[TRES_ARRAY_CPU] =
			(uint64_t)(job_ptr->total_cpus ? 
job_ptr->total_cpus : job_ptr->details->min_cpus); tres_req_cnt[TRES_ARRAY_MEM] = job_get_tres_mem( job_ptr->job_resrcs, job_ptr->details->pn_min_memory, tres_req_cnt[TRES_ARRAY_CPU], selected_node_cnt); tres_req_cnt[TRES_ARRAY_NODE] = (uint64_t)selected_node_cnt; assoc_mgr_lock(&locks); gres_set_job_tres_cnt(job_ptr->gres_list, selected_node_cnt, tres_req_cnt, true); tres_req_cnt[TRES_ARRAY_BILLING] = assoc_mgr_tres_weighted( tres_req_cnt, job_ptr->part_ptr->billing_weights, slurmctld_conf.priority_flags, true); if (acct_policy_job_runnable_pre_select(job_ptr, true) && acct_policy_job_runnable_post_select(job_ptr, tres_req_cnt, true)) { assoc_mgr_unlock(&locks); tres_alloc_save[begun_jobs++] = job_ptr->tres_alloc_cnt; job_ptr->tres_alloc_cnt = xmalloc(slurmctld_tres_size); memcpy(job_ptr->tres_alloc_cnt, tres_req_cnt, slurmctld_tres_size); acct_policy_job_begin(job_ptr); } else { assoc_mgr_unlock(&locks); runnable = false; break; } } list_iterator_reset(iter); while ((rec = (het_job_rec_t *) list_next(iter))) { job_ptr = rec->job_ptr; if (begun_jobs > fini_jobs) { time_t end_time_exp = job_ptr->end_time_exp; time_t end_time = job_ptr->end_time; uint32_t job_state = job_ptr->job_state; /* Simulate normal job completion */ job_ptr->end_time_exp = now; job_ptr->end_time = job_ptr->start_time; job_ptr->job_state = JOB_COMPLETE | JOB_COMPLETING; acct_policy_job_fini(job_ptr); job_ptr->end_time_exp = end_time_exp; job_ptr->end_time = end_time; job_ptr->job_state = job_state; xfree(job_ptr->tres_alloc_cnt); job_ptr->tres_alloc_cnt = tres_alloc_save[fini_jobs++]; } } list_iterator_destroy(iter); xfree(tres_alloc_save); return runnable; } /* * Start all components of a hetjob now */ static int _het_job_start_now(het_job_map_t *map, node_space_map_t *node_space) { job_record_t *job_ptr; bitstr_t *avail_bitmap = NULL, *exc_core_bitmap = NULL; bitstr_t *resv_bitmap = NULL, *used_bitmap = NULL; het_job_rec_t *rec; ListIterator iter; int mcs_select, rc = SLURM_SUCCESS; bool resv_overlap = false; time_t now = time(NULL), start_res; uint32_t hard_limit; iter = list_iterator_create(map->het_job_rec_list); while ((rec = (het_job_rec_t *) list_next(iter))) { bool reset_time = false; job_ptr = rec->job_ptr; job_ptr->part_ptr = rec->part_ptr; /* * Identify the nodes which this job can use */ start_res = now; rc = job_test_resv(job_ptr, &start_res, true, &avail_bitmap, &exc_core_bitmap, &resv_overlap, false); FREE_NULL_BITMAP(exc_core_bitmap); if (rc != SLURM_SUCCESS) { error("%pJ failed to start due to reservation", job_ptr); FREE_NULL_BITMAP(avail_bitmap); break; } bit_and(avail_bitmap, job_ptr->part_ptr->node_bitmap); bit_and(avail_bitmap, up_node_bitmap); if (used_bitmap) bit_and_not(avail_bitmap, used_bitmap); filter_by_node_owner(job_ptr, avail_bitmap); mcs_select = slurm_mcs_get_select(job_ptr); filter_by_node_mcs(job_ptr, mcs_select, avail_bitmap); if (job_ptr->details->exc_node_bitmap) { bit_and_not(avail_bitmap, job_ptr->details->exc_node_bitmap); } if (fed_mgr_job_lock(job_ptr)) { error("%pJ failed to start due to fed job lock", job_ptr); FREE_NULL_BITMAP(avail_bitmap); continue; } resv_bitmap = avail_bitmap; avail_bitmap = NULL; bit_not(resv_bitmap); rc = _start_job(job_ptr, resv_bitmap); FREE_NULL_BITMAP(resv_bitmap); if (rc == SLURM_SUCCESS) { /* * If the following fails because of network * connectivity, the origin cluster should ask * when it comes back up if the cluster_lock * cluster actually started the job */ fed_mgr_job_start(job_ptr, job_ptr->start_time); if (debug_flags & 
DEBUG_FLAG_HETJOB) { info("%pJ started", job_ptr); } if (!used_bitmap && job_ptr->node_bitmap) used_bitmap = bit_copy(job_ptr->node_bitmap); else if (job_ptr->node_bitmap) bit_or(used_bitmap, job_ptr->node_bitmap); } else { fed_mgr_job_unlock(job_ptr); break; } if (job_ptr->time_min) { /* Set time limit as high as possible */ acct_policy_alter_job(job_ptr, map->comp_time_limit); job_ptr->time_limit = map->comp_time_limit; reset_time = true; } if (job_ptr->start_time) { if (job_ptr->time_limit == INFINITE) hard_limit = YEAR_SECONDS; else hard_limit = job_ptr->time_limit * 60; job_ptr->end_time = job_ptr->start_time + hard_limit; /* * Only set if start_time. end_time must be set * beforehand for _reset_job_time_limit. */ if (reset_time) _reset_job_time_limit(job_ptr, now, node_space); } if (reset_time) jobacct_storage_job_start_direct(acct_db_conn, job_ptr); } list_iterator_destroy(iter); FREE_NULL_BITMAP(used_bitmap); return rc; } /* * Deallocate all components if failed hetjob start */ static void _het_job_kill_now(het_job_map_t *map) { job_record_t *job_ptr; het_job_rec_t *rec; ListIterator iter; time_t now = time(NULL); int cred_lifetime = 1200; uint32_t save_bitflags; (void) slurm_cred_ctx_get(slurmctld_config.cred_ctx, SLURM_CRED_OPT_EXPIRY_WINDOW, &cred_lifetime); iter = list_iterator_create(map->het_job_rec_list); while ((rec = (het_job_rec_t *) list_next(iter))) { job_ptr = rec->job_ptr; if (IS_JOB_PENDING(job_ptr)) continue; info("Deallocate %pJ due to hetjob start failure", job_ptr); job_ptr->details->begin_time = now + cred_lifetime + 1; job_ptr->end_time = now; job_ptr->job_state = JOB_PENDING | JOB_COMPLETING; last_job_update = now; build_cg_bitmap(job_ptr); job_completion_logger(job_ptr, false); deallocate_nodes(job_ptr, false, false, false); /* * Since the job_completion_logger() removes the submit, * we need to add it again, but don't stage-out burst buffer */ save_bitflags = job_ptr->bit_flags; job_ptr->bit_flags |= JOB_KILL_HURRY; acct_policy_add_job_submit(job_ptr); job_ptr->bit_flags = save_bitflags; if (!job_ptr->node_bitmap_cg || (bit_set_count(job_ptr->node_bitmap_cg) == 0)) batch_requeue_fini(job_ptr); } list_iterator_destroy(iter); } /* * If all components of a heterogeneous job can start now, then do so * node_space IN - map of available resources through time * map IN - info about this heterogeneous job * single IN - true if testing single heterogeneous jobs */ static void _het_job_start_test_single(node_space_map_t *node_space, het_job_map_t *map, bool single) { time_t now = time(NULL); int rc; if (!map) return; if (!_het_job_full(map)) { if (debug_flags & DEBUG_FLAG_HETJOB) { info("Hetjob %u has indefinite start time", map->het_job_id); } if (!single) map->prev_start = now + YEAR_SECONDS; return; } map->prev_start = _het_job_start_compute(map, 0); if (map->prev_start > now) { if (debug_flags & DEBUG_FLAG_HETJOB) { info("Hetjob %u should be able to start in %u seconds", map->het_job_id, (uint32_t) (map->prev_start - now)); } return; } if (!_het_job_limit_check(map, now)) { if (debug_flags & DEBUG_FLAG_HETJOB) { info("Hetjob %u prevented from starting by account/QOS limit", map->het_job_id); } map->prev_start = now + YEAR_SECONDS; return; } if (debug_flags & DEBUG_FLAG_HETJOB) info("Attempting to start hetjob %u", map->het_job_id); rc = _het_job_start_now(map, node_space); if (rc != SLURM_SUCCESS) { if (debug_flags & DEBUG_FLAG_HETJOB) { info("Failed to start hetjob %u", map->het_job_id); } _het_job_kill_now(map); } else { job_start_cnt += 
list_count(map->het_job_rec_list);
		if (max_backfill_jobs_start &&
		    (job_start_cnt >= max_backfill_jobs_start)) {
			if (debug_flags & DEBUG_FLAG_BACKFILL)
				info("backfill: bf_max_job_start limit of %d reached",
				     max_backfill_jobs_start);
		}
	}
}

static int _het_job_start_test_list(void *map, void *node_space)
{
	if (!max_backfill_jobs_start ||
	    (job_start_cnt < max_backfill_jobs_start))
		_het_job_start_test_single(node_space, map, false);

	return SLURM_SUCCESS;
}

/*
 * If all components of a heterogeneous job can start now, then do so
 * node_space IN - map of available resources through time
 * het_job_id IN - the ID of the heterogeneous job to evaluate,
 *		   if zero then evaluate all heterogeneous jobs
 */
static void _het_job_start_test(node_space_map_t *node_space,
				uint32_t het_job_id)
{
	het_job_map_t *map = NULL;

	if (!het_job_id) {
		/* Test all maps. */
		(void) list_for_each(het_job_list,
				     _het_job_start_test_list, node_space);
	} else {
		/* Test single map. */
		map = (het_job_map_t *) list_find_first(het_job_list,
							_het_job_find_map,
							&het_job_id);
		_het_job_start_test_single(node_space, map, true);
	}
}

static void _deadlock_global_list_del(void *x)
{
	deadlock_part_struct_t *dl_part_ptr = (deadlock_part_struct_t *) x;
	FREE_NULL_LIST(dl_part_ptr->deadlock_job_list);
	xfree(dl_part_ptr);
}

static int _deadlock_part_list_srch(void *x, void *key)
{
	deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x;
	job_record_t *job_ptr = (job_record_t *) key;

	if (dl_job->het_job_id == job_ptr->het_job_id)
		return 1;
	return 0;
}

static int _deadlock_part_list_srch2(void *x, void *key)
{
	deadlock_job_struct_t *dl_job = (deadlock_job_struct_t *) x;
	deadlock_job_struct_t *dl_job2 = (deadlock_job_struct_t *) key;

	if (dl_job->het_job_id == dl_job2->het_job_id)
		return 1;
	return 0;
}

static int _deadlock_global_list_srch(void *x, void *key)
{
	deadlock_part_struct_t *dl_part = (deadlock_part_struct_t *) x;

	if (dl_part->part_ptr == (part_record_t *) key)
		return 1;
	return 0;
}

static int _deadlock_job_list_sort(void *x, void *y)
{
	deadlock_job_struct_t *dl_job_ptr1 = *(deadlock_job_struct_t **) x;
	deadlock_job_struct_t *dl_job_ptr2 = *(deadlock_job_struct_t **) y;

	if (dl_job_ptr1->start_time > dl_job_ptr2->start_time)
		return -1;
	else if (dl_job_ptr1->start_time < dl_job_ptr2->start_time)
		return 1;
	return 0;
}

/*
 * Call at end of backfill execution to release memory allocated by
 * _het_job_deadlock_test()
 */
static void _het_job_deadlock_fini(void)
{
	FREE_NULL_LIST(deadlock_global_list);
}

/*
 * Determine if job can run at its "start_time" or later.
* job_ptr IN - job to test, set reason to "HET_JOB_DEADLOCK" if it will deadlock * RET true if the job can not run due to possible deadlock with other hetjob * * NOTE: If there are a large number of hetjobs this will be painfully slow * as the algorithm must be order n^2 */ static bool _het_job_deadlock_test(job_record_t *job_ptr) { deadlock_job_struct_t *dl_job_ptr = NULL, *dl_job_ptr2 = NULL; deadlock_job_struct_t *dl_job_ptr3 = NULL; deadlock_part_struct_t *dl_part_ptr = NULL, *dl_part_ptr2 = NULL; ListIterator job_iter, part_iter; bool have_deadlock = false; if (!job_ptr->het_job_id || !job_ptr->part_ptr) return false; /* * Find the list representing the ordering of jobs in this specific * partition and add this job in the list, sorted by job start time */ if (!deadlock_global_list) { deadlock_global_list = list_create(_deadlock_global_list_del); } else { dl_part_ptr = list_find_first(deadlock_global_list, _deadlock_global_list_srch, job_ptr->part_ptr); } if (!dl_part_ptr) { dl_part_ptr = xmalloc(sizeof(deadlock_part_struct_t)); dl_part_ptr->deadlock_job_list = list_create(xfree_ptr); dl_part_ptr->part_ptr = job_ptr->part_ptr; list_append(deadlock_global_list, dl_part_ptr); } else { dl_job_ptr = list_find_first(dl_part_ptr->deadlock_job_list, _deadlock_part_list_srch, job_ptr); } if (!dl_job_ptr) { dl_job_ptr = xmalloc(sizeof(deadlock_job_struct_t)); dl_job_ptr->het_job_id = job_ptr->het_job_id; dl_job_ptr->start_time = job_ptr->start_time; list_append(dl_part_ptr->deadlock_job_list, dl_job_ptr); } else if (dl_job_ptr->start_time < job_ptr->start_time) { dl_job_ptr->start_time = job_ptr->start_time; } list_sort(dl_part_ptr->deadlock_job_list, _deadlock_job_list_sort); /* * Log current table of hetjob start times by partition */ if (debug_flags & DEBUG_FLAG_BACKFILL) { part_iter = list_iterator_create(deadlock_global_list); while ((dl_part_ptr2 = (deadlock_part_struct_t *) list_next(part_iter))){ info("Partition %s Hetjobs:", dl_part_ptr2->part_ptr->name); job_iter = list_iterator_create(dl_part_ptr2-> deadlock_job_list); while ((dl_job_ptr2 = (deadlock_job_struct_t *) list_next(job_iter))) { info(" Hetjob %u to start at %"PRIu64, dl_job_ptr2->het_job_id, (uint64_t) dl_job_ptr2->start_time); } list_iterator_destroy(job_iter); } list_iterator_destroy(part_iter); } /* * Determine if any hetjobs scheduled to start earlier than this job * in this partition are scheduled to start after it in some other * partition */ part_iter = list_iterator_create(deadlock_global_list); while ((dl_part_ptr2 = (deadlock_part_struct_t *)list_next(part_iter))){ if (dl_part_ptr2 == dl_part_ptr) /* Current partition, skip it */ continue; dl_job_ptr2 = list_find_first(dl_part_ptr2->deadlock_job_list, _deadlock_part_list_srch, job_ptr); if (!dl_job_ptr2) /* Hetjob not in this partition, no check */ continue; job_iter = list_iterator_create(dl_part_ptr->deadlock_job_list); while ((dl_job_ptr2 = (deadlock_job_struct_t *) list_next(job_iter))) { if (dl_job_ptr2->het_job_id == dl_job_ptr->het_job_id) break; /* Self */ dl_job_ptr3 = list_find_first( dl_part_ptr2->deadlock_job_list, _deadlock_part_list_srch2, dl_job_ptr2); if (dl_job_ptr3 && (dl_job_ptr3->start_time < dl_job_ptr->start_time)){ have_deadlock = true; break; } } list_iterator_destroy(job_iter); if (have_deadlock && (debug_flags & DEBUG_FLAG_BACKFILL)) { info("Hetjob %u in partition %s would deadlock " "with hetjob %u in partition %s, skipping it", dl_job_ptr->het_job_id, dl_part_ptr->part_ptr->name, dl_job_ptr3->het_job_id, dl_part_ptr2->part_ptr->name); 
} if (have_deadlock) break; } list_iterator_destroy(part_iter); return have_deadlock; }