/* * OpenPBS (Portable Batch System) v2.3 Software License * * Copyright (c) 1999-2000 Veridian Information Solutions, Inc. * All rights reserved. * * --------------------------------------------------------------------------- * For a license to use or redistribute the OpenPBS software under conditions * other than those described below, or to purchase support for this software, * please contact Veridian Systems, PBS Products Department ("Licensor") at: * * www.OpenPBS.org +1 650 967-4675 sales@OpenPBS.org * 877 902-4PBS (US toll-free) * --------------------------------------------------------------------------- * * This license covers use of the OpenPBS v2.3 software (the "Software") at * your site or location, and, for certain users, redistribution of the * Software to other sites and locations. Use and redistribution of * OpenPBS v2.3 in source and binary forms, with or without modification, * are permitted provided that all of the following conditions are met. * After December 31, 2001, only conditions 3-6 must be met: * * 3. Any Redistribution of source code must retain the above copyright notice * and the acknowledgment contained in paragraph 6, this list of conditions * and the disclaimer contained in paragraph 7. * * 4. Any Redistribution in binary form must reproduce the above copyright * notice and the acknowledgment contained in paragraph 6, this list of * conditions and the disclaimer contained in paragraph 7 in the * documentation and/or other materials provided with the distribution. * * 5. Redistributions in any form must be accompanied by information on how to * obtain complete source code for the OpenPBS software and any * modifications and/or additions to the OpenPBS software. 
The source code * must either be included in the distribution or be available for no more * than the cost of distribution plus a nominal fee, and all modifications * and additions to the Software must be freely redistributable by any party * (including Licensor) without restriction. * * 6. All advertising materials mentioning features or use of the Software must * display the following acknowledgment: * * "This product includes software developed by NASA Ames Research Center, * Lawrence Livermore National Laboratory, and Veridian Information * Solutions, Inc. * Visit www.OpenPBS.org for OpenPBS software support, * products, and information." * * 7. DISCLAIMER OF WARRANTY * * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT * ARE EXPRESSLY DISCLAIMED. * * IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE * U.S. GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This license will be governed by the laws of the Commonwealth of Virginia, * without reference to its choice of law rules. 
*/ #include #include #include /* the master config generated by configure */ #include "pbsd_init.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _CRAY #include #endif /* _CRAY */ #include #include #include #include "pbs_ifl.h" #include "log.h" #include "../lib/Liblog/pbs_log.h" #include "../lib/Liblog/log_event.h" #include "../lib/Liblog/setup_env.h" #include "../lib/Liblog/chk_file_sec.h" #include "../lib/Libifl/lib_ifl.h" #include "list_link.h" #include "attribute.h" #include "server_limits.h" #include "server.h" #include "pbs_job.h" #include "resource.h" #include "work_task.h" #include "tracking.h" #include "svrfunc.h" #include "acct.h" #include "net_connect.h" #include "pbs_proto.h" #include "batch_request.h" #include "array.h" #include "csv.h" #include "pbs_nodes.h" #include "threadpool.h" #include "../lib/Libutils/u_lock_ctl.h" /* unlock_node */ #include "queue_recov.h" /* que_recov_xml */ #include "dynamic_string.h" #include "utils.h" #include "queue_recycler.h" /* queue_recycler */ #include "svr_func.h" /* get_svr_attr_* */ #include "login_nodes.h" #include "track_alps_reservations.h" #include "job_func.h" /* svr_job_purge */ #include "net_cache.h" #include "ji_mutex.h" #include "user_info.h" #include "hash_map.h" #include "mutex_mgr.hpp" #include "../lib/Libnet/lib_net.h" #include "alps_constants.h" /*#ifndef SIGKILL*/ /* there is some weird stuff in gcc include files signal.h & sys/params.h */ #include /*#endif*/ #ifndef TRUE #define TRUE 1 #endif /* TRUE */ #ifndef FALSE #define FALSE 0 #endif /* FALSE */ /* global Data Items */ struct addrinfo hints; extern char *msg_daemonname; extern char *msg_init_abt; extern char *msg_init_queued; extern char *msg_init_substate; extern char *msg_err_noqueue; extern char *msg_err_malloc; extern char *msg_init_noqueues; extern char *msg_init_recovque; extern char *msg_init_expctq; extern char *msg_init_nojobs; extern char 
*msg_init_exptjobs; extern char *msg_init_norerun; extern char *msg_init_unkstate; extern char *msg_init_baddb; extern char *msg_init_chdir; extern char *msg_init_badjob; extern char *msg_script_open; extern char *acct_file; extern char *log_file; extern char *job_log_file; extern char *path_home; extern char *path_acct; extern char path_log[]; extern char *path_priv; extern char *path_arrays; extern char *path_jobs; extern char *path_credentials; extern char *path_queues; extern char *path_spool; extern char *path_svrdb; extern char *path_svrdb_new; extern char *path_svrlog; extern char *path_track; extern char *path_nodes; extern char *path_mom_hierarchy; extern char *path_nodes_new; extern char *path_nodestate; extern char *path_nodenote; extern char *path_nodenote_new; extern char *path_checkpoint; extern char *path_jobinfo_log; extern int queue_rank; extern char server_name[]; extern tlist_head svr_newnodes; extern std::list *task_list_timed; extern pthread_mutex_t task_list_timed_mutex; task_recycler tr; extern struct all_jobs alljobs; extern struct all_jobs array_summary; extern struct all_jobs newjobs; all_queues svr_queues; job_recycler recycler; queue_recycler q_recycler; hash_map *exiting_jobs_info; dynamic_string *hierarchy_holder; hello_container hellos; hello_container failures; reservation_holder alps_reservations; batch_request_holder brh; extern pthread_mutex_t *acctfile_mutex; pthread_mutex_t *scheduler_sock_jobct_mutex; extern int scheduler_sock; extern int scheduler_jobct; extern pthread_mutex_t *svr_do_schedule_mutex; extern pthread_mutex_t *listener_command_mutex; extern pthread_mutex_t *node_state_mutex; extern pthread_mutex_t *check_tasks_mutex; extern pthread_mutex_t *reroute_job_mutex; extern mom_hierarchy_t *mh; extern int a_opt_init; extern int LOGLEVEL; extern char *plogenv; extern bool auto_send_hierarchy; extern struct server server; /* External Functions Called */ void poll_job_task(work_task *); extern void on_job_rerun_task(struct 
work_task *); extern void set_resc_assigned(job *, enum batch_op); extern void set_old_nodes(job *); extern void acct_close(void); extern struct work_task *apply_job_delete_nanny(struct job *, int); extern int net_move(job *, struct batch_request *); void on_job_exit_task(struct work_task *); /* Private functions in this file */ void init_abt_job(job *); char *build_path(char *, const char *, const char *); void catch_abort(int); void change_logs(); void change_logs_handler(int); void change_log_level(int); int chk_save_file(char *); int pbsd_init_job(job *, int); int pbsd_init_reque(job *, int); void resume_net_move(struct work_task *); void rm_files(char *); void stop_me(int); void change_logs_handler(int sig); /* private data */ int run_change_logs = FALSE; #define CHANGE_STATE 1 #define KEEP_STATE 0 /** * Initialize a dynamic array to a specific size * @param Array (O) Assumed to be uninitialized struct * @param InitialSize (I) raised to 0 if less than 0 */ int DArrayInit( darray_t *Array, /* I */ int InitialSize) /* I */ { if (InitialSize <= 0) { Array->Length = 0; Array->Data = NULL; } else { Array->Length = InitialSize; Array->Data = (void **)calloc(sizeof(Array->Data[0]), InitialSize); if (Array->Data == NULL) return(FAILURE); } Array->AppendIndex = 0; return(SUCCESS); } /*END DArrayInit */ /** * Free the resources associated with Array * It does NOT free any data stored in the array, just the array structure itself. 
* param Array (I) */ int DArrayFree( darray_t *Array) /* I */ { free(Array->Data); Array->Data = NULL; Array->Length = 0; Array->AppendIndex = 0; return(SUCCESS); } /*END DArrayFree */ /** * Append Item onto the end of Array, resizing it if necessary * @param Array (I/O) * @param Item (I) */ int DArrayAppend( darray_t *Array, /* I/O */ void *Item) /* I */ { void **tmp = NULL; if (Array->AppendIndex >= Array->Length) { int newLength = Array->Length * 2; if (newLength <= 10) newLength = 10; tmp = (void **)calloc(newLength, sizeof(Array->Data[0])); if (tmp == NULL) { free(Array->Data); Array->Length = 0; Array->AppendIndex = 0; return(FAILURE); } memcpy(tmp, Array->Data, sizeof(Array->Data[0]) * Array->Length); free(Array->Data); Array->Data = tmp; Array->Length = newLength; } Array->Data[Array->AppendIndex++] = Item; return(SUCCESS); } /* END DArrayAppend */ /** * Sort two job structs by their priority in ascending order * @param A (I) * @param B (I) */ int SortPrioAscend( const void *A, /* I */ const void *B) /* I */ { job *pjob1 = *((job **)A); job *pjob2 = *((job **)B); int prio1 = pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long; int prio2 = pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long; return(prio1 - prio2); } /*END SortPrioAscend */ void update_default_np() { struct pbsnode *pnode; int iter = -1; long default_np = 0; get_svr_attr_l(SRV_ATR_NPDefault, &default_np); if (default_np > 0) { while ((pnode = next_host(&allnodes,&iter,NULL)) != NULL) { while (pnode->nd_slots.get_total_execution_slots() < default_np) add_execution_slot(pnode); unlock_node(pnode, __func__, NULL, LOGLEVEL); } } return; } /* END update_default_np() */ /* Add the server names from /var/spool/torque/server_name to the trusted hosts list. 
*/
void add_server_names_to_acl_hosts(void)

{
  int            n;
  int            list_len;
  char          *server_list_ptr;
  char          *tp;
  char           buffer[PBS_MAXSERVERNAME+1];
  pbs_attribute  temp;
  pbs_attribute *patr = &server.sv_attr[SRV_ATR_acl_hosts];

  memset(buffer, 0, PBS_MAXSERVERNAME+1);
  memset(&temp, 0, sizeof(pbs_attribute));

  server_list_ptr = pbs_get_server_list();
  list_len = csv_length(server_list_ptr);

  for (n = 0; n < list_len; n++)
  {
    tp = csv_nth(server_list_ptr, n);

    if (tp == NULL)
      continue;

    snprintf(buffer, sizeof(buffer), "%s", tp);

    /* Don't include any port specification */
    if ((tp = strchr(buffer, ':')))
      *tp = 0;

    /* Stop at the first name that cannot be decoded. */
    if (decode_arst_direct(&temp, buffer) != 0)
      return;

    /* First make sure that the strings are not there. */
    set_arst(patr, &temp, DECR);
    set_arst(patr, &temp, INCR);
    free_arst(&temp);
  }

  return;
}

/*
 * make_default_hierarchy()
 * Builds the send-format hierarchy used when no mom_hierarchy file is
 * available: every node in allnodes is listed, comma separated, in a
 * single level, with ":<port>" appended when the node's rm port differs
 * from PBS_MANAGER_SERVICE_PORT. Each node's nd_hierarchy_level is set to 0.
 *
 * NOTE(review): the empty string literals appended below look like markup
 * tags that were lost from this copy of the file - confirm against upstream.
 *
 * @return the newly allocated dynamic_string, or NULL if either working
 *         string could not be allocated (nothing is leaked in that case)
 */
dynamic_string *make_default_hierarchy()

{
  struct pbsnode *pnode;
  dynamic_string *default_hierarchy;
  dynamic_string *level_ds;
  int             iter = -1;
  char            buf[MAXLINE];

  if ((default_hierarchy = get_dynamic_string(-1, NULL)) == NULL)
  {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return(NULL);
  }

  if ((level_ds = get_dynamic_string(-1, NULL)) == NULL)
  {
    /* release the string that was already obtained */
    free_dynamic_string(default_hierarchy);
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return(NULL);
  }

  copy_to_end_of_dynamic_string(default_hierarchy, "");
  copy_to_end_of_dynamic_string(default_hierarchy, "");

  while ((pnode = next_host(&allnodes, &iter, NULL)) != NULL)
  {
    if (level_ds->used > 0)
      append_dynamic_string(level_ds, ",");

    append_dynamic_string(level_ds, pnode->nd_name);

    if (PBS_MANAGER_SERVICE_PORT != pnode->nd_mom_rm_port)
    {
      snprintf(buf, sizeof(buf), ":%d", (int)pnode->nd_mom_rm_port);
      append_dynamic_string(level_ds, buf);
    }

    pnode->nd_hierarchy_level = 0;

    unlock_node(pnode, __func__, NULL, LOGLEVEL);
  }

  copy_to_end_of_dynamic_string(default_hierarchy, level_ds->str);
  copy_to_end_of_dynamic_string(default_hierarchy, "");
  copy_to_end_of_dynamic_string(default_hierarchy, "");

  free_dynamic_string(level_ds);

  return(default_hierarchy);
} /* END make_default_hierarchy() */

/*
 * can_resolve_hostname()
 * Reports whether hostname (optionally followed by ":port") is already in
 * the address cache or can be resolved with pbs_getaddrinfo(); a fresh
 * resolution is inserted into the cache. The ':' is temporarily replaced
 * by a terminator and restored before returning.
 *
 * @return TRUE if the name resolves, FALSE otherwise
 */
int can_resolve_hostname(

  char *hostname)

{
  char            *colon;
  struct addrinfo *addr_info;
  int              can_resolve = FALSE;

  if ((colon = strchr(hostname, ':')) != NULL)
    *colon = '\0';

  if (get_cached_addrinfo(hostname) != NULL)
    can_resolve = TRUE;
  else if (pbs_getaddrinfo(hostname, NULL, &addr_info) == 0)
  {
    can_resolve = TRUE;
    insert_addr_name_info(addr_info,hostname);
  }

  if (colon != NULL)
    *colon = ':';

  return(can_resolve);
} /* END can_resolve_hostname() */

/*
 * check_if_in_nodes_file()
 * When parsing the mom_hierarchy file, make sure that the nodes found there
 * are also present in the nodes file, and create the nodes if they don't exist already.
 * Also, mark nodes as having been found in the hierarchy file so that the hierarchy
 * can be checked for completeness later.
 *
 * Any ":port" suffix on hostname is hidden while the node is looked up and
 * is put back on every exit path, including the error returns, so the
 * caller's string is never left truncated.
 *
 * @pre-cond: hostname must be a valid char pointer
 * @pre-cond: the nodes file must be parsed before the mom hierarchy
 * @post-cond: any nodes in the hierarchy file that aren't in the nodes file are created
 * @post-cond: rm_port has the mom's rm port stored in it (left untouched if
 *             the node could not be resolved or created)
 */
void check_if_in_nodes_file(

  char           *hostname,
  int             level_index,
  unsigned short &rm_port)

{
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  struct pbsnode     *pnode;
  char               *colon;
  struct addrinfo    *addr_info;
  struct sockaddr_in *sai;
  unsigned long       ipaddr;

  if ((colon = strchr(hostname, ':')) != NULL)
    *colon = '\0';

  if ((pnode = find_nodebyname(hostname)) == NULL)
  {
    snprintf(log_buf, sizeof(log_buf),
      "Node %s found in mom_hierarchy but not found in nodes file. Adding",
      hostname);
    log_err(-1, __func__, log_buf);

    if ((sai = get_cached_addrinfo(hostname)) == NULL)
    {
      if (pbs_getaddrinfo(hostname, NULL, &addr_info) == 0)
      {
        sai = (struct sockaddr_in *)addr_info->ai_addr;
        ipaddr = ntohl(sai->sin_addr.s_addr);
        insert_addr_name_info(addr_info, hostname);
      }
      else
      {
        log_err(errno, __func__, "getaddrinfo failed");

        /* undo the temporary truncation before bailing out */
        if (colon != NULL)
          *colon = ':';

        return;
      }
    }
    else
      ipaddr = ntohl(sai->sin_addr.s_addr);

    create_partial_pbs_node(hostname, ipaddr, ATR_DFLAG_MGRD | ATR_DFLAG_MGWR);

    pnode = find_nodebyname(hostname);

    if (pnode == NULL)
    {
      snprintf(log_buf, sizeof(log_buf),
        "Failed to add node %s to nodes file.",
        hostname);
      log_err(-1, __func__, log_buf);

      /* undo the temporary truncation before bailing out */
      if (colon != NULL)
        *colon = ':';

      return;
    }
  }

  rm_port = pnode->nd_mom_rm_port;
  pnode->nd_in_hierarchy = TRUE;

  /* keep the shallowest level at which the node has been seen */
  if (pnode->nd_hierarchy_level > level_index)
    pnode->nd_hierarchy_level = level_index;

  unlock_node(pnode, __func__, NULL, LOGLEVEL);

  if (colon != NULL)
    *colon = ':';
} /* END check_if_in_nodes_file() */

/*
 * convert_level_to_send_format()
 *
 * @pre-cond: level must be a valid resizable array of node_comm_t
 * @post-cond: all nodes at this level are added to send format in the format for sending
 */
void convert_level_to_send_format(

  resizable_array *level,
  int              level_index,
  dynamic_string  *send_format)

{
  node_comm_t       *nc;
  std::stringstream  level_string;
  int                node_iter = -1;

  copy_to_end_of_dynamic_string(send_format, "");

  while ((nc = (node_comm_t *)next_thing(level, &node_iter)) != NULL)
  {
    unsigned short rm_port = 0;

    /* comma-separate every entry after the first */
    if (level_string.str().size() != 0)
      level_string << ",";

    check_if_in_nodes_file(nc->name, level_index, rm_port);

    level_string << nc->name;

    if (rm_port != PBS_MANAGER_SERVICE_PORT)
    {
      level_string << ":";
      level_string << rm_port;
    }
  }

  copy_to_end_of_dynamic_string(send_format, level_string.str().c_str());
  copy_to_end_of_dynamic_string(send_format, "");
} /* END convert_level_to_send_format() */

/*
 * convert_path_to_send_format()
 * iterates over each level in the path and adds in to send format appropriately.
* * @pre-cond: path must be a valid resizable array of resizable arrays. * @post-cond: this path is added to send_format in the correct format. */ void convert_path_to_send_format( resizable_array *path, dynamic_string *send_format) { int levels_iter = -1; resizable_array *level; copy_to_end_of_dynamic_string(send_format, ""); while ((level = (resizable_array *)next_thing(path, &levels_iter)) != NULL) convert_level_to_send_format(level, levels_iter, send_format); copy_to_end_of_dynamic_string(send_format, ""); } /* END convert_path_to_send_format() */ /* * add_missing_nodes() * * @pre-cond: nodes have been marked if they are in the hierarchy or not * (i.e.: check_if_in_nodes_file() has been called for all nodes in the hierarchy) * @post-cond: any nodes not in the hierarchy are added in a new path, all at level 1 */ void add_missing_nodes( dynamic_string *send_format) { struct pbsnode *pnode; bool found_missing_node = false; int iter = -1; char log_buf[LOCAL_LOG_BUF_SIZE]; std::string level_string = ""; /* check if there are nodes that weren't in the hierarchy file that are in the nodes file */ while ((pnode = next_host(&allnodes, &iter, NULL)) != NULL) { if (pnode->nd_in_hierarchy == FALSE) { if (found_missing_node == false) { copy_to_end_of_dynamic_string(send_format, ""); copy_to_end_of_dynamic_string(send_format, ""); found_missing_node = true; level_string += pnode->nd_name; } else { level_string += ","; level_string += pnode->nd_name; } snprintf(log_buf, sizeof(log_buf), "Node %s found in the nodes file but not in the mom_hierarchy file. 
Making it a level 1 node", pnode->nd_name); pnode->nd_hierarchy_level = 0; log_err( -1, __func__, log_buf); } unlock_node(pnode, __func__, NULL, LOGLEVEL); } if (found_missing_node == true) { copy_to_end_of_dynamic_string(send_format, level_string.c_str()); copy_to_end_of_dynamic_string(send_format, ""); copy_to_end_of_dynamic_string(send_format, ""); } } /* END add_missing_nodes() */ /* * convert_mom_hierarchy_to_send_format() * iterates over the mom_hierarchy struct and adds each node to send_format * in the format for sending. * */ dynamic_string *convert_mom_hierarchy_to_send_format() { resizable_array *path; int paths_iter = -1; dynamic_string *send_format = get_dynamic_string(-1, NULL); while ((path = (resizable_array *)next_thing(mh->paths, &paths_iter)) != NULL) convert_path_to_send_format(path, send_format); if (send_format->used == 0) { free_dynamic_string(send_format); send_format = make_default_hierarchy(); } else add_missing_nodes(send_format); return(send_format); } /* * prepare_mom_hierarchy() * opens the mom hierarchy file, creates a mom hierarchy, and places it into a format * to be sent to the mom nodes. * if no hierarchy file exists or if it cannot be parsed, all of the nodes are placed * into a default hierarchy with all nodes at level 1. * * @pre-cond: nodes file has been parsed. * @post-cond: send_format is populated so that the hierarchy can be sent. 
*/ dynamic_string *prepare_mom_hierarchy() { char log_buf[LOCAL_LOG_BUF_SIZE]; int fds; dynamic_string *send_format = NULL; mh = initialize_mom_hierarchy(); if ((fds = open(path_mom_hierarchy, O_RDONLY, 0)) < 0) { if (errno == ENOENT) { /* Each node is a top level node */ send_format = make_default_hierarchy(); return(send_format); } snprintf(log_buf, sizeof(log_buf), "Unable to open %s", path_mom_hierarchy); log_err(errno, __func__, log_buf); } else { parse_mom_hierarchy(fds); send_format = convert_mom_hierarchy_to_send_format(); } if (fds >= 0) close(fds); return(send_format); } /* END prepare_mom_hierarchy() */ int get_insertion_point( struct pbsnode *pnode, int *indices) { int i; int level = pnode->nd_hierarchy_level; int insertion_point = 0; for (i = level - 1; i >= 0; i--) { if (indices[i] != 0) { insertion_point = indices[i]; break; } } return(insertion_point); } /* END get_insertion_point() */ void add_all_nodes_to_hello_container() { struct pbsnode *pnode; int iter = -1; int level_indices[MAX_LEVEL_DEPTH]; int insertion_index; char *node_name_dup; memset(level_indices, 0, sizeof(level_indices)); while ((pnode = next_host(&allnodes, &iter, NULL)) != NULL) { if ((node_name_dup = strdup(pnode->nd_name)) != NULL) { /* make sure to insert things in order */ if (level_indices[pnode->nd_hierarchy_level] == 0) { insertion_index = get_insertion_point(pnode, level_indices); level_indices[pnode->nd_hierarchy_level] = add_hello_after(&hellos, node_name_dup, insertion_index); } else add_hello_after(&hellos, node_name_dup, level_indices[pnode->nd_hierarchy_level]); } unlock_node(pnode, __func__, NULL, LOGLEVEL); } return; } /* END add_all_nodes_to_hello_container() */ int get_default_threads() { int default_threads = DEFAULT_MIN_THREADS; int count = 0; char label[128]; char log_buf[LOCAL_LOG_BUF_SIZE]; FILE *fp; if ((fp = fopen("/proc/cpuinfo", "r")) != NULL) { /* if we can determine the number of cores, make * the default number of threads 2 * cores + 1 */ while 
(!feof(fp)) { if (fscanf(fp, "%s %*[^\n]%*c", label) == 0) { getc(fp); /* must do something to get to eof */ } else if (strcmp("processor", label) == 0) count++; } if (count > 0) default_threads = (2 * count) + 1; fclose(fp); } snprintf(log_buf, sizeof(log_buf), "Defaulting min_threads to %d threads", default_threads); log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, log_buf); return(default_threads); } /* END get_default_threads() */ int setup_limits() { #ifndef DEBUG #ifndef _CRAY struct rlimit rlimit; #endif #endif #ifndef DEBUG #ifdef _CRAY limit(C_JOB, 0, L_CPROC, 0); limit(C_JOB, 0, L_CPU, 0); limit(C_JOBPROCS, 0, L_CPU, 0); limit(C_PROC, 0, L_FD, 255); limit(C_JOB, 0, L_FSBLK, 0); limit(C_JOBPROCS, 0, L_FSBLK, 0); limit(C_JOB, 0, L_MEM , 0); limit(C_JOBPROCS, 0, L_MEM , 0); #else /* not _CRAY */ rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; setrlimit(RLIMIT_CPU, &rlimit); setrlimit(RLIMIT_FSIZE, &rlimit); setrlimit(RLIMIT_DATA, &rlimit); setrlimit(RLIMIT_STACK, &rlimit); #ifdef RLIMIT_RSS setrlimit(RLIMIT_RSS, &rlimit); #endif /* RLIMIT_RSS */ #ifdef RLIMIT_VMEM setrlimit(RLIMIT_VMEM, &rlimit); #endif /* RLIMIT_VMEM */ #endif /* not _CRAY */ #endif /* DEBUG */ return(PBSE_NONE); } /* END setup_limits() */ int setup_signal_handling() { struct sigaction act; struct sigaction oact; sigemptyset(&act.sa_mask); act.sa_flags = 0; act.sa_handler = change_logs_handler; if (sigaction(SIGHUP, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for HUP"); return(2); } act.sa_handler = stop_me; if (sigaction(SIGINT, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for INT"); return(2); } if (sigaction(SIGTERM, &act, &oact) != 0) { log_err(errno, __func__, "sigactin for TERM"); return(2); } #ifdef NDEBUG if (sigaction(SIGQUIT, &act, &oact) != 0) { log_err(errno, __func__, "sigactin for QUIT"); return(2); } #endif /* NDEBUG */ #ifdef SIGSHUTDN if (sigaction(SIGSHUTDN, &act, &oact) != 0) { log_err(errno, __func__, "sigactin for 
SHUTDN"); return(2); } #endif /* SIGSHUTDN */ /* * Catch these signals to ensure we core dump even if * our rlimit for core dumps is set to 0 initially. * * Chris Samuel - VPAC * csamuel@vpac.org - 29th July 2003 * * Now conditional on PBSCOREDUMP environment variable. * 13th August 2003. */ if (getenv("PBSCOREDUMP")) { act.sa_handler = catch_abort; /* make sure we core dump */ sigaction(SIGSEGV, &act, NULL); sigaction(SIGBUS, &act, NULL); sigaction(SIGFPE, &act, NULL); sigaction(SIGILL, &act, NULL); sigaction(SIGTRAP, &act, NULL); sigaction(SIGSYS, &act, NULL); } act.sa_handler = SIG_DFL; if (sigaction(SIGCHLD, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for CHLD"); return(2); } act.sa_handler = SIG_IGN; if (sigaction(SIGPIPE, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for PIPE"); return(2); } if (sigaction(SIGCHLD, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for SIGCHLD"); return(2); } act.sa_handler = change_log_level; if (sigaction(SIGUSR1, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for USR1"); return(2); } if (sigaction(SIGUSR2, &act, &oact) != 0) { log_err(errno, __func__, "sigaction for USR2"); return(2); } return(PBSE_NONE); } /* END setup_signal_handling() */ int initialize_paths() { int rc = PBSE_NONE; const char *suffix_slash = "/"; const char *new_tag = ".new"; struct stat statbuf; char log_buf[LOCAL_LOG_BUF_SIZE]; #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) char EMsg[1024]; #endif /* not DEBUG and not NO_SECURITY_CHECK */ if (path_priv == NULL) path_priv = build_path(path_home, PBS_SVR_PRIVATE, suffix_slash); path_arrays = build_path(path_priv, PBS_ARRAYDIR, suffix_slash); path_spool = build_path(path_home, PBS_SPOOLDIR, suffix_slash); path_queues = build_path(path_priv, PBS_QUEDIR, suffix_slash); path_jobs = build_path(path_priv, PBS_JOBDIR, suffix_slash); path_credentials = build_path(path_priv, PBS_CREDENTIALDIR, suffix_slash); path_acct = build_path(path_priv, PBS_ACCT, suffix_slash); if 
(path_svrdb == NULL) path_svrdb = build_path(path_priv, PBS_SERVERDB, NULL); path_svrdb_new = build_path(path_priv, PBS_SERVERDB, new_tag); path_svrlog = build_path(path_home, PBS_LOGFILES, suffix_slash); path_jobinfo_log = build_path(path_home, PBS_JOBINFOLOGDIR, suffix_slash); path_track = build_path(path_priv, PBS_TRACKING, NULL); path_nodes = build_path(path_priv, NODE_DESCRIP, NULL); path_nodes_new = build_path(path_priv, NODE_DESCRIP, new_tag); path_nodestate = build_path(path_priv, NODE_STATUS, NULL); path_nodenote = build_path(path_priv, NODE_NOTE, NULL); path_nodenote_new = build_path(path_priv, NODE_NOTE, new_tag); path_mom_hierarchy = build_path(path_priv, PBS_MOM_HIERARCHY, NULL); #ifdef SERVER_CHKPTDIR /* need to make sure path ends with a '/' */ if (*(SERVER_CHKPTDIR + strlen(SERVER_CHKPTDIR) - 1) == '/') { path_checkpoint = strdup(SERVER_CHKPTDIR); } else { int len = strlen(SERVER_CHKPTDIR) + strlen(suffix_slash) + 1; path_checkpoint = (char *) calloc(1, len); snprintf(path_checkpoint, len, "%s%s", SERVER_CHKPTDIR, suffix_slash); } #else path_checkpoint = build_path(path_home, PBS_CHKPTDIR, suffix_slash); #endif /* check existance amd make sure it is a directory */ if (stat(path_checkpoint, &statbuf) < 0) { sprintf(log_buf, "unable to stat checkpoint directory %s, errno %d (%s)", path_checkpoint, errno, strerror(errno)); log_err(errno, "pbs_init", log_buf); return(-1); } if (!S_ISDIR(statbuf.st_mode)) { sprintf(log_buf, "checkpoint directory path %s is not a directory", path_checkpoint); log_err(errno, "pbs_init", log_buf); return(-1); } #ifdef SERVER_CHKPTDIR /* set permissions on checkpoint path, if needed */ if ((statbuf.st_mode && 01777) != 01777) { if (chmod(path_checkpoint, 01777) != 0) { log_err(errno, __func__, "can't chmod 01777 checkpoint"); } } #endif if (svr_resc_def == NULL) { if ((rc = init_resc_defs()) != PBSE_NONE) { log_err(rc, __func__, msg_init_baddb); return(-1); } } #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) rc = 
chk_file_sec(path_jobs, 1, 0, S_IWGRP | S_IWOTH, 1, EMsg); rc |= chk_file_sec(path_queues, 1, 0, S_IWGRP | S_IWOTH, 0, EMsg); rc |= chk_file_sec(path_spool, 1, 1, S_IWOTH, 0, EMsg); rc |= chk_file_sec(path_acct, 1, 0, S_IWGRP | S_IWOTH, 0, EMsg); rc |= chk_file_sec(path_credentials, 1, 0, S_IWGRP | S_IWOTH, 0, EMsg); rc |= chk_file_sec((char *)PBS_ENVIRON, 0, 0, S_IWGRP | S_IWOTH, 1, EMsg); if (rc != PBSE_NONE) { return(3); } #endif /* not DEBUG and not NO_SECURITY_CHECK */ return(rc); } /* END initialize_paths() */ int initialize_data_structures_and_mutexes() { long cray_enabled = FALSE; svr_do_schedule_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(svr_do_schedule_mutex, NULL); check_tasks_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(check_tasks_mutex, NULL); listener_command_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(listener_command_mutex, NULL); node_state_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(node_state_mutex, NULL); scheduler_sock_jobct_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(scheduler_sock_jobct_mutex, NULL); reroute_job_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(reroute_job_mutex, NULL); pthread_mutex_lock(scheduler_sock_jobct_mutex); scheduler_sock = -1; scheduler_jobct = 0; pthread_mutex_unlock(scheduler_sock_jobct_mutex); /* make the task list child and events mutexes recursive because * they can be called by a signal handler */ initialize_recycler(); initialize_batch_request_holder(); task_list_timed = new std::list(); pthread_mutex_init(&task_list_timed_mutex, NULL); initialize_all_jobs_array(&alljobs); initialize_all_jobs_array(&array_summary); initialize_all_jobs_array(&newjobs); initialize_hello_container(&hellos); initialize_hello_container(&failures); initialize_task_recycler(); initialize_queue_recycler(); 
initialize_user_info_holder(&users); CLEAR_HEAD(svr_newnodes); initialize_all_arrays_array(); initialize_allques_array(&svr_queues); exiting_jobs_info = get_hash_map(-1); get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled); if (cray_enabled == TRUE) { initialize_login_holder(); initialize_alps_reservations(); } acctfile_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t)); pthread_mutex_init(acctfile_mutex, NULL); return(PBSE_NONE); } /* END initialize_data_structures_and_mutexes() */ int setup_server_attrs( int type) { int i; int rc = PBSE_NONE; pthread_mutex_lock(server.sv_attr_mutex); for (i = 0; i < SRV_ATR_LAST; i++) clear_attr(&server.sv_attr[i], &svr_attr_def[i]); server.sv_attr[SRV_ATR_scheduler_iteration].at_val.at_long = PBS_SCHEDULE_CYCLE; server.sv_attr[SRV_ATR_scheduler_iteration].at_flags = ATR_VFLAG_SET; server.sv_attr[SRV_ATR_State].at_val.at_long = SV_STATE_INIT; server.sv_attr[SRV_ATR_State].at_flags = ATR_VFLAG_SET; svr_attr_def[SRV_ATR_mailfrom].at_decode( &server.sv_attr[SRV_ATR_mailfrom], 0, 0, (char *)PBS_DEFAULT_MAIL, 0); server.sv_attr[SRV_ATR_tcp_timeout].at_val.at_long = PBS_TCPTIMEOUT; server.sv_attr[SRV_ATR_tcp_timeout].at_flags = ATR_VFLAG_SET; server.sv_attr[SRV_ATR_check_rate].at_val.at_long = PBS_NORMAL_PING_RATE / 2; server.sv_attr[SRV_ATR_check_rate].at_flags = ATR_VFLAG_SET; server.sv_attr[SRV_ATR_JobStatRate].at_val.at_long = PBS_RESTAT_JOB; server.sv_attr[SRV_ATR_JobStatRate].at_flags = ATR_VFLAG_SET; server.sv_attr[SRV_ATR_PollJobs].at_val.at_long = PBS_POLLJOBS; server.sv_attr[SRV_ATR_PollJobs].at_flags = ATR_VFLAG_SET; server.sv_attr[SRV_ATR_MomJobSync].at_flags = ATR_VFLAG_SET; server.sv_attr[SRV_ATR_MomJobSync].at_val.at_long = 1; server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long = TRUE; server.sv_attr[SRV_ATR_MoabArrayCompatible].at_flags = ATR_VFLAG_SET; /* force logging of all types */ server.sv_attr[SRV_ATR_log_events].at_val.at_long = PBSEVENT_MASK; server.sv_attr[SRV_ATR_log_events].at_flags = 
ATR_VFLAG_SET; server.sv_attr[SRV_ATR_nppcu].at_val.at_long = APBASIL_DEFAULT_NPPCU_VALUE; server.sv_attr[SRV_ATR_nppcu].at_flags = ATR_VFLAG_SET; /* If not a "create" initialization, recover server db */ rc = chk_save_file(path_svrdb); if (type != RECOV_CREATE) { /* Open the server database (save file) and read it in */ if ((rc != PBSE_NONE) || ((rc = svr_recov_xml(path_svrdb, FALSE)) == -1)) { log_err(rc, __func__, msg_init_baddb); return(-1); } if (server.sv_attr[SRV_ATR_resource_assn].at_flags & ATR_VFLAG_SET) { svr_attr_def[SRV_ATR_resource_assn].at_free( &server.sv_attr[SRV_ATR_resource_assn]); } } else { rm_files(path_priv); pthread_mutex_unlock(server.sv_attr_mutex); svr_save(&server, SVR_SAVE_FULL); pthread_mutex_lock(server.sv_attr_mutex); } rc = PBSE_NONE; svr_attr_def[SRV_ATR_version].at_decode( &server.sv_attr[SRV_ATR_version], 0, 0, (char *)PACKAGE_VERSION, 0); /* open accounting file and job log file if logging is set */ if (acct_open(acct_file) != 0) { pthread_mutex_unlock(server.sv_attr_mutex); return(-1); } if (server.sv_attr[SRV_ATR_RecordJobInfo].at_val.at_long) { rc = job_log_open(job_log_file, path_jobinfo_log); if (rc != PBSE_NONE) { fprintf(stderr, "Could not open job_logs \n"); pthread_mutex_unlock(server.sv_attr_mutex); return(-1); } } /* set up other server and global variables */ if (a_opt_init != -1) { /* a_option was set, overrides saved value of scheduling attr */ server.sv_attr[SRV_ATR_scheduling].at_val.at_long = a_opt_init; server.sv_attr[SRV_ATR_scheduling].at_flags |= ATR_VFLAG_SET; } pthread_mutex_unlock(server.sv_attr_mutex); return(rc); } /* END setup_server_attrs() */ int initialize_nodes() { initialize_all_nodes_array(&allnodes); if (setup_nodes() == -1) { return(-1); } add_server_names_to_acl_hosts(); update_default_np(); return(PBSE_NONE); } /* END initialize_nodes() */ int handle_queue_recovery( int type) { int rc = PBSE_NONE; struct dirent *pdirent; DIR *dir; int had; pbs_queue *pque = NULL; char 
log_buf[LOCAL_LOG_BUF_SIZE]; int logtype; if (chdir(path_queues) != 0) { sprintf(log_buf, msg_init_chdir, path_queues); log_err(errno, __func__, log_buf); return(-1); } had = server.sv_qs.sv_numque; server.sv_qs.sv_numque = 0; dir = opendir("."); if (dir == NULL) { log_err(-1, __func__, msg_init_noqueues); sprintf(log_buf, "%s:1", __func__); unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf); return(-1); } while ((pdirent = readdir(dir)) != NULL) { if (pdirent->d_name[0] == '\0') { /* invalid name returned */ continue; } if (chk_save_file(pdirent->d_name) == 0) { /* recover queue */ if ((pque = que_recov_xml(pdirent->d_name)) != NULL) { /* que_recov increments sv_numque */ sprintf(log_buf, msg_init_recovque, pque->qu_qs.qu_name); log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); if (pque->qu_attr[QE_ATR_ResourceAssn].at_flags & ATR_VFLAG_SET) { que_attr_def[QE_ATR_ResourceAssn].at_free(&pque->qu_attr[QE_ATR_ResourceAssn]); } unlock_queue(pque, __func__, NULL, LOGLEVEL); } } } closedir(dir); if ((had != server.sv_qs.sv_numque) && (type != RECOV_CREATE)) logtype = PBSEVENT_ERROR | PBSEVENT_SYSTEM; else logtype = PBSEVENT_SYSTEM; sprintf(log_buf, msg_init_expctq, had, server.sv_qs.sv_numque); log_event(logtype, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); return(rc); } /* END handle_queue_recovery() */ void mark_as_badjob( const char *filename) { char basen[MAXPATHLEN+1]; snprintf(basen, sizeof(basen), "%s%s", filename, JOB_BAD_SUFFIX); if (link(filename, basen) < 0) { log_err(errno, __func__, "failed to link corrupt .JB file to .BD"); } else { unlink(filename); } } int handle_array_recovery( int type) { char log_buf[LOCAL_LOG_BUF_SIZE]; struct dirent *pdirent; DIR *dir; int rc = PBSE_NONE; int rc2 = PBSE_NONE; job_array *pa = NULL; int baselen = 0; int array_suf_len = strlen(ARRAY_FILE_SUFFIX); char *psuffix; if (chdir(path_arrays) != 0) { sprintf(log_buf, msg_init_chdir, path_arrays); log_err(errno, 
__func__, log_buf); sprintf(log_buf, "%s:2", __func__); unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf); return(-1); } if((dir = opendir(".")) == NULL) return -1; while ((pdirent = readdir(dir)) != NULL) { if (chk_save_file(pdirent->d_name) == PBSE_NONE) { /* if not create or clean recovery, recover arrays */ if ((type != RECOV_CREATE) && (type != RECOV_COLD)) { /* skip files without the proper suffix */ baselen = strlen(pdirent->d_name) - array_suf_len; psuffix = pdirent->d_name + baselen; if (strcmp(psuffix, ARRAY_FILE_SUFFIX)) continue; if ((rc = array_recov(pdirent->d_name, &pa)) != PBSE_NONE) { sprintf(log_buf, "could not recover array-struct from file %s--skipping. job array can not be recovered.", pdirent->d_name); log_err(errno, __func__, log_buf); sprintf(log_buf, "%s:3", __func__); unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf); mark_as_badjob(pdirent->d_name); rc2 = rc; /* rc2 captures the latest error in rc */ } else { pa->jobs_recovered = 0; unlock_ai_mutex(pa, __func__, "2", LOGLEVEL); } } else { unlink(pdirent->d_name); } } } closedir(dir); if (rc2 != PBSE_NONE) rc = rc2; return(rc); } /* handle_array_recovery() */ int handle_job_recovery( int type) { char log_buf[LOCAL_LOG_BUF_SIZE]; struct dirent *pdirent; DIR *dir; int had; int rc = PBSE_NONE; int job_rc = PBSE_NONE; job *pjob; int logtype; int baselen = 0; char *psuffix; int job_count = 0; /* Count of recovered jobs */ const char *job_suffix = JOB_FILE_SUFFIX; int job_suf_len = strlen(job_suffix); char basen[MAXPATHLEN+1]; int Index; int iter = -1; time_t time_now = time(NULL); if (chdir(path_jobs) != 0) { sprintf(log_buf, msg_init_chdir, path_jobs); log_err(errno, __func__, log_buf); sprintf(log_buf, "%s:1", __func__); unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf); return(-1); } had = server.sv_qs.sv_numjobs; server.sv_qs.sv_numjobs = 0; sprintf(log_buf, "%s:2", __func__); unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf); dir = opendir("."); if (dir == NULL) { if ((type != RECOV_CREATE) && 
(type != RECOV_COLD)) { if (had == 0) { log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, msg_init_nojobs); } else { sprintf(log_buf, msg_init_exptjobs, had, 0); log_err(-1, __func__, log_buf); } } } else { darray_t Array; DArrayInit(&Array,100); /* Now, for each job found ... */ while ((pdirent = readdir(dir)) != NULL) { job_count++; if ((job_count % 1000) == 0) { snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "%d files read from disk", job_count); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); } if (chk_save_file(pdirent->d_name) == 0) { /* recover the jobs */ baselen = strlen(pdirent->d_name) - job_suf_len; psuffix = pdirent->d_name + baselen; if (!strcmp(psuffix, ".TA")) { if ((pjob = job_recov(pdirent->d_name)) != NULL) { pjob->ji_is_array_template = TRUE; if (DArrayAppend(&Array,pjob) == FAILURE) { log_err(ENOMEM,"main", (char *)"out of memory reloading jobs"); exit(-1); } if (type == RECOV_COLD) pjob->ji_cold_restart = TRUE; unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL); } continue; } if (strcmp(psuffix, job_suffix)) continue; if ((pjob = job_recov(pdirent->d_name)) != NULL) { if (DArrayAppend(&Array,pjob) == FAILURE) { log_err(ENOMEM, "main", (char *)"out of memory reloading jobs"); exit(-1); } if (type == RECOV_COLD) pjob->ji_cold_restart = TRUE; unlock_ji_mutex(pjob, __func__, "2", LOGLEVEL); } else { sprintf(log_buf, msg_init_badjob, pdirent->d_name); log_err(-1, __func__, log_buf); /* remove corrupt job */ snprintf(basen, sizeof(basen), "%s%s", pdirent->d_name, JOB_BAD_SUFFIX); if (link(pdirent->d_name, basen) < 0) { log_err(errno, __func__, "failed to link corrupt .JB file to .BD"); } else { unlink(pdirent->d_name); } } } } /* END while ((pdirent = readdir(dir)) != NULL) */ snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "%d total files read from disk", job_count); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); closedir(dir); qsort(Array.Data, Array.AppendIndex, sizeof(Array.Data[0]), 
SortPrioAscend); for (Index = 0; Index < Array.AppendIndex; Index++) { job *pjob = (job *)Array.Data[Index]; lock_ji_mutex(pjob, __func__, NULL, LOGLEVEL); job_rc = pbsd_init_job(pjob, type); if (job_rc != PBSE_NONE) { log_event( PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, msg_script_open); unlock_ji_mutex(pjob, __func__, "4", LOGLEVEL); continue; } if ((type != RECOV_COLD) && (type != RECOV_CREATE) && (pjob->ji_arraystructid[0] == '\0') && (pjob->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT)) { snprintf(basen, sizeof(basen), "%s%s", pjob->ji_qs.ji_fileprefix, JOB_SCRIPT_SUFFIX); if (chk_save_file(basen) != 0) { log_event( PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, msg_script_open); init_abt_job(pjob); } else { /* set up the poll_task for this recovered job - * only do up to 10 per second to not overwhelm pbs_server*/ if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)) { set_task(WORK_Timed, time_now + 10 + (Index % 10), poll_job_task, strdup(pjob->ji_qs.ji_jobid), FALSE); } unlock_ji_mutex(pjob, __func__, "5", LOGLEVEL); } } else { /* set up the poll_task for this recovered job - * only do up to 10 per second to not overwhelm pbs_server*/ if ((pjob->ji_qs.ji_state == JOB_STATE_RUNNING) && (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str != NULL)) { set_task(WORK_Timed, time_now + 10 + (Index % 10), poll_job_task, strdup(pjob->ji_qs.ji_jobid), FALSE); } unlock_ji_mutex(pjob, __func__, "6", LOGLEVEL); } } DArrayFree(&Array); sprintf(log_buf, "%s:1", __func__); lock_sv_qs_mutex(server.sv_qs_mutex, log_buf); if ((had != server.sv_qs.sv_numjobs) && (type != RECOV_CREATE) && (type != RECOV_COLD)) { logtype = PBSEVENT_ERROR | PBSEVENT_SYSTEM; } else { logtype = PBSEVENT_SYSTEM; } sprintf(log_buf, msg_init_exptjobs, had, server.sv_qs.sv_numjobs); sprintf(log_buf, 
"%s:3", __func__); unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf); log_event(logtype,PBS_EVENTCLASS_SERVER,msg_daemonname,log_buf); } /* END else */ /* If queue_rank has gone negative, renumber all jobs and reset rank */ if (queue_rank < 0) { iter = -1; queue_rank = 0; while ((pjob = next_job(&alljobs, &iter)) != NULL) { pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long = ++queue_rank; job_save(pjob, SAVEJOB_FULL, 0); unlock_ji_mutex(pjob, __func__, "7", LOGLEVEL); } } return(rc); } /* END handle_job_recovery() */ int cleanup_recovered_arrays() { job_array *pa; job *pjob; char arrayid[PBS_MAXSVRJOBID+1]; int iter = -1; int rc = PBSE_NONE; while ((pa = next_array(&iter)) != NULL) { int job_template_exists = FALSE; if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL) { mutex_mgr job_mgr(pjob->ji_mutex,true); job_template_exists = TRUE; } /* if no jobs were recovered, delete this array */ if (pa->jobs_recovered == 0) { if ((pjob = svr_find_job(pa->ai_qs.parent_id, FALSE)) != NULL) svr_job_purge(pjob); array_delete(pa); /* move on to the next array */ continue; } strcpy(arrayid, pa->ai_qs.parent_id); /* see if we need to upgrade the array version. 
*/ /* We will upgrade from version 3 or later */ if (pa->ai_qs.struct_version == 3) { pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION; pa->ai_qs.num_purged = pa->ai_qs.num_jobs - pa->jobs_recovered; array_save(pa); } if (pa->ai_qs.num_cloned != pa->ai_qs.num_jobs) { /* if we can't finish building the job array then delete whats been done so far */ if (job_template_exists == FALSE) { int i; for (i = 0; i < pa->ai_qs.array_size; i++) { if (pa->job_ids[i] != NULL) { if ((pjob = svr_find_job(pa->job_ids[i], FALSE)) != NULL) { unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); svr_job_purge(pjob); pa = get_array(arrayid); } } } array_delete(pa); continue; } else { /* TODO Someone must have been naughty and did a kill -9 on pbs_server, we might need to validate that the last job was fully initialized before continuing the cloning process. */ enqueue_threadpool_request(job_clone_wt, strdup(pa->ai_qs.parent_id)); } } else if ((pa->ai_qs.jobs_done == pa->ai_qs.num_jobs) && (job_template_exists == FALSE)) { array_delete(pa); continue; } unlock_ai_mutex(pa, __func__, "1", LOGLEVEL); } /* END for each array */ return(rc); } /* END cleanup_recovered_arrays() */ int handle_job_and_array_recovery( int type) { int rc; if ((rc = handle_array_recovery(type)) != PBSE_NONE) return(rc); else if ((rc = handle_job_recovery(type)) != PBSE_NONE) return(rc); else rc = cleanup_recovered_arrays(); return(rc); } /* END handle_job_and_array_recovery() */ int handle_tracking_records() { int fd; int rc = PBSE_NONE; int i; struct stat statbuf; #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) char EMsg[1024]; #endif /* not DEBUG and not NO_SECURITY_CHECK */ if ((fd = open(path_track, O_RDONLY | O_CREAT, 0600)) < 0) { log_err(errno, __func__, "unable to open tracking file"); return(-1); } #if !defined(DEBUG) && !defined(NO_SECURITY_CHECK) if (chk_file_sec(path_track, 0, 0, S_IWGRP | S_IWOTH, 0, EMsg) != 0) { close(fd); return(-1); } #endif /* not DEBUG and not NO_SECURITY_CHECK */ if (fstat(fd, 
&statbuf) < 0) { log_err(errno, "pbs_init", (char *)"unable to stat tracking file"); close(fd); return(-1); } server.sv_tracksize = (statbuf.st_size + sizeof(struct tracking) - 1) / sizeof(struct tracking); if (server.sv_tracksize < PBS_TRACK_MINSIZE) server.sv_tracksize = PBS_TRACK_MINSIZE; if ((server.sv_track = (struct tracking *)calloc(server.sv_tracksize, sizeof(struct tracking))) == NULL) { /* FAILURE - cannot alloc memory */ log_err(errno, "pbs_init", (char *)"calloc failure"); close(fd); return(-1); } for (i = 0; i < server.sv_tracksize; i++) (server.sv_track + i)->tk_mtime = 0; /* NOTE: tracking file records are optional */ if (read_ac_socket(fd, (char *)server.sv_track, server.sv_tracksize * sizeof(struct tracking)) < 0) { log_err(errno, "pbs_init", "unable to read tracksize from tracking file"); } close(fd); server.sv_trackmodifed = 0; /* set work task to periodically save the tracking records */ set_task(WORK_Timed, (long)(time(NULL) + PBS_SAVE_TRACK_TM), track_save, (char *)NULL, FALSE); return(rc); } /* END handle_tracking_records() */ void setup_threadpool() { long min_threads; long max_threads; long thread_idle_time = DEFAULT_THREAD_IDLE; min_threads = get_default_threads(); max_threads = min_threads * 10; /* setup the threadpool for use */ get_svr_attr_l(SRV_ATR_minthreads, &min_threads); get_svr_attr_l(SRV_ATR_maxthreads, &max_threads); get_svr_attr_l(SRV_ATR_threadidleseconds, &thread_idle_time); initialize_threadpool(&request_pool, min_threads, max_threads, thread_idle_time); } /* END setup_threadpool() */ /* * This file contains the functions to initialize the PBS Batch Server. * The code is called once when the server is brought up. 
*/ int pbsd_init( int type) /* type of initialization */ { int ret = PBSE_NONE; gid_t gid; char log_buf[LOCAL_LOG_BUF_SIZE]; memset(&hints, 0, sizeof(hints)); hints.ai_flags = AI_CANONNAME; /* The following is code to reduce security risks */ if (setup_env(PBS_ENVIRON) == -1) { return(-1); } gid = getgid(); /* secure suppl. groups */ if (setgroups(1, &gid) != 0) { snprintf(log_buf, sizeof(log_buf), "Unable to drop secondary groups. Some MAC framework is active?\n"); log_err(errno, __func__, log_buf); snprintf(log_buf, sizeof(log_buf), "setgroups(group = %lu) failed: %s\n", (unsigned long)gid, strerror(errno)); log_err(errno, __func__, log_buf); return(-1); } setup_threadpool(); setup_limits(); /* 1. set up to catch or ignore various signals */ if ((ret = setup_signal_handling()) != PBSE_NONE) return(ret); /* 2. set up the various paths and other global variables we need */ if ((ret = initialize_paths()) != PBSE_NONE) return(ret); initialize_data_structures_and_mutexes(); /* 3. Set default server attibutes values */ if ((ret = setup_server_attrs(type)) != PBSE_NONE) return(ret); /* Open and read in node list if one exists */ if ((ret = initialize_nodes()) != PBSE_NONE) return(ret); /* the functions we're calling assume this mutex is locked */ sprintf(log_buf, "%s:1", __func__); lock_sv_qs_mutex(server.sv_qs_mutex, log_buf); if ((ret = handle_queue_recovery(type)) != PBSE_NONE) return(ret); handle_job_and_array_recovery(type); /* Put us back in the Server's Private directory */ if (chdir(path_priv) != 0) { sprintf(log_buf, msg_init_chdir, path_priv); log_err(-1, __func__, log_buf); return(3); } handle_tracking_records(); /* read the hierarchy file */ if ((hierarchy_holder = prepare_mom_hierarchy()) == NULL) { /* hierarchy file exists but we couldn't open it */ return(-1); } /* mark all nodes as needing a hello */ if (auto_send_hierarchy == true) add_all_nodes_to_hello_container(); /* allow the threadpool to start processing */ start_request_pool(); /* SUCCESS */ 
return(PBSE_NONE); } /* END pbsd_init() */ /* * build_path - build the pathname for a PBS directory */ char *build_path( char *parent, /* parent directory name (dirname) */ const char *name, /* sub directory name */ const char *suffix) /* suffix string to append */ { int prefixslash; char *ppath; size_t len; /* * allocate space for the names + maybe a slash between + the suffix */ if (*(parent + strlen(parent) - 1) == '/') prefixslash = 0; else prefixslash = 1; len = strlen(parent) + strlen(name) + prefixslash + 1; if (suffix != NULL) len += strlen(suffix); ppath = (char *)calloc(1, PATH_MAX); if (ppath != NULL) { strcat(ppath, parent); if (prefixslash) strcat(ppath, "/"); strcat(ppath, name); if (suffix) strcat(ppath, suffix); return(ppath); } log_err(errno, "build_path", msg_err_malloc); pthread_mutex_lock(&log_mutex); log_close(1); pthread_mutex_unlock(&log_mutex); exit(3); } /* END build_path() */ /* * pbsd_init_job - decide what to do with the recovered job structure * * The action depends on the type of initialization. 
*/

/*
 * Summary of what the code below does with a recovered job:
 *  - refreshes the at_server attribute from server_name and raises
 *    queue_rank if this job's rank is higher;
 *  - RECOV_COLD / RECOV_CREATE: the job is aborted with init_abt_job() and
 *    PBSE_BAD_PARAMETER is returned;
 *  - otherwise the saved substate selects how the job is put back in its
 *    queue (pbsd_init_reque() with CHANGE_STATE or KEEP_STATE) and which
 *    follow-up work task, if any, is scheduled;
 *  - an unknown substate aborts the job with job_abt();
 *  - finally, if the job still exists and carries a mom address, the
 *    address is looked up again from the exec_host (or, when Cray support
 *    is enabled and a login node id is set, from the login node).
 *
 * Returns the result of pbsd_init_reque() for the substates that requeue,
 * PBSE_NONE when no requeue is attempted, PBSE_BAD_PARAMETER for cold and
 * create starts, and PBSE_JOBSUBSTATE when job_abt() cleared pjob.
 */

int pbsd_init_job(

  job *pjob,  /* I */
  int  type)  /* I */

{
  unsigned int      d;
  int               rc = PBSE_NONE;
  time_t            time_now = time(NULL);
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  int               local_errno = 0;
  char              job_id[PBS_MAXSVRJOBID+1];
  long              job_atr_hold;
  int               job_exit_status;
  long              cray_enabled = FALSE;

  pjob->ji_momhandle = -1;

  /* update at_server pbs_attribute in case name changed */
  job_attr_def[JOB_ATR_at_server].at_free(
    &pjob->ji_wattr[JOB_ATR_at_server]);

  job_attr_def[JOB_ATR_at_server].at_decode(
    &pjob->ji_wattr[JOB_ATR_at_server],
    NULL,
    NULL,
    server_name,
    0);

  /* update queue_rank if this job is higher than current */
  /* (both sides are compared as unsigned long) */
  if ((unsigned long)pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long > (unsigned long)queue_rank)
    queue_rank = pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  /* now based on the initialization type */
  if ((type == RECOV_COLD) || (type == RECOV_CREATE))
    {
    /* cold/create start: no job survives; the non-zero return tells the
     * caller the job was not set up */
    init_abt_job(pjob);

    return(PBSE_BAD_PARAMETER);
    }

  /* the hot-start flag is only kept across a RECOV_HOT start */
  if (type != RECOV_HOT)
    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

  switch (pjob->ji_qs.ji_substate)
    {
    case JOB_SUBSTATE_TRANSICM:

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)
        {
        /*
         * This server created the job, so client
         * was qsub (a transient client), it won't be
         * around to recommit, so auto-commit now
         */
        pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

        rc = pbsd_init_reque(pjob, CHANGE_STATE);
        }
      else
        {
        /*
         * another server is sending, append to new job
         * list and wait for commit; need to clear
         * receiving socket number though
         */
        pjob->ji_qs.ji_un.ji_newt.ji_fromsock = -1;

        insert_job(&newjobs,pjob);
        }

      break;

    case JOB_SUBSTATE_TRNOUT:

      pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

      /* requeue as queued */
      rc = pbsd_init_reque(pjob, CHANGE_STATE);

      break;

    case JOB_SUBSTATE_TRNOUTCM:

      /* requeue as is - rdy to cmt */
      rc = pbsd_init_reque(pjob, KEEP_STATE);

      /* resend rtc */
      /* the task receives its own copy of the job id (strdup) */
      set_task(WORK_Immed, 0, resume_net_move, strdup(pjob->ji_qs.ji_jobid), FALSE);

      break;

    case JOB_SUBSTATE_QUEUED:
    case JOB_SUBSTATE_PRESTAGEIN:
    case JOB_SUBSTATE_STAGEIN:
    case JOB_SUBSTATE_STAGECMP:
    case JOB_SUBSTATE_STAGEFAIL:
    case JOB_SUBSTATE_STAGEGO:
    case JOB_SUBSTATE_CHKPTGO:
    case JOB_SUBSTATE_CHKPTCMP:
    case JOB_SUBSTATE_HELD:
    case JOB_SUBSTATE_SYNCHOLD:
    case JOB_SUBSTATE_DEPNHOLD:
    case JOB_SUBSTATE_WAITING:
    case JOB_SUBSTATE_PRERUN:
    case JOB_SUBSTATE_ARRAY_TEMP:

      /* all of these are requeued with their state re-evaluated */
      rc = pbsd_init_reque(pjob, CHANGE_STATE);

      break;

    case JOB_SUBSTATE_RUNNING:

      if (pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str == NULL)
        {
        /* corrupt job file. job is in running state without exec_host list */
        pjob->ji_wattr[JOB_ATR_hold].at_val.at_long |= HOLD_s;
        pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
        pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

        snprintf(log_buf, sizeof(log_buf), "Job %s file says it is in a running state but has no exec host list, adding system hold", pjob->ji_qs.ji_jobid);
        log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

        /* NOTE(review): the comment is only replaced when one already
         * exists; a job without a comment gets none - confirm intended */
        if (pjob->ji_wattr[JOB_ATR_Comment].at_val.at_str != NULL)
          {
          free(pjob->ji_wattr[JOB_ATR_Comment].at_val.at_str);
          pjob->ji_wattr[JOB_ATR_Comment].at_val.at_str = strdup(log_buf);
          pjob->ji_wattr[JOB_ATR_Comment].at_flags |= ATR_VFLAG_SET;
          }

        rc = pbsd_init_reque(pjob, CHANGE_STATE);
        }
      else
        {
        rc = pbsd_init_reque(pjob, KEEP_STATE);

        /* clear the flag first, then count the job's resources again */
        pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_RescAssn;

        set_resc_assigned(pjob, INCR);

        /* suspended jobs don't get reassigned to nodes */
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0)
          {
          set_old_nodes(pjob);
          }

        if (type == RECOV_HOT)
          pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART;
        }

      break;

    case JOB_SUBSTATE_SYNCRES:

      /* clear all dependent job ready flags */
      depend_clrrdy(pjob);

      rc = pbsd_init_reque(pjob, CHANGE_STATE);

      break;

    case JOB_SUBSTATE_EXITING:
    case JOB_SUBSTATE_STAGEOUT:
    case JOB_SUBSTATE_STAGEDEL:
    case JOB_SUBSTATE_EXITED:
    case JOB_SUBSTATE_ABORT:

      /* This is delayed because it is highly likely MS is "state-unknown"
       * at this time, and there's no real hurry anyways. */
      apply_job_delete_nanny(pjob, time_now + 60);

      set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);

      rc = pbsd_init_reque(pjob, KEEP_STATE);

      break;

    case JOB_SUBSTATE_COMPLETE:

      /* Completed jobs are no longer purged on startup */
      set_task(WORK_Immed, 0, on_job_exit_task, strdup(pjob->ji_qs.ji_jobid), FALSE);

      rc = pbsd_init_reque(pjob, KEEP_STATE);

      /* do array bookeeping */
      if ((pjob->ji_arraystructid[0] != '\0') &&
          (pjob->ji_is_array_template == FALSE))
        {
        /* get_jobs_array() receives &pjob and may set pjob to NULL */
        job_array *pa = get_jobs_array(&pjob);

        if (pjob != NULL)
          {
          /* copy what update_array_values() needs, because the job mutex
           * is released before the array is updated */
          strcpy(job_id, pjob->ji_qs.ji_jobid);
          job_atr_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
          job_exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;
          unlock_ji_mutex(pjob, __func__, "1", LOGLEVEL);

          update_array_values(pa,JOB_STATE_RUNNING,aeTerminate, job_id, job_atr_hold, job_exit_status);

          unlock_ai_mutex(pa, __func__, "1", LOGLEVEL);

          /* look the job up again by id; the result may be NULL */
          pjob = svr_find_job(job_id, FALSE);
          }
        }

      break;

    case JOB_SUBSTATE_RERUN:

      if (pjob->ji_qs.ji_state == JOB_STATE_EXITING)
        set_task(WORK_Immed, 0, on_job_rerun_task, strdup(pjob->ji_qs.ji_jobid), FALSE);

      rc = pbsd_init_reque(pjob, KEEP_STATE);

      break;

    case JOB_SUBSTATE_RERUN1:
    case JOB_SUBSTATE_RERUN2:

      set_task(WORK_Immed, 0, on_job_rerun_task, strdup(pjob->ji_qs.ji_jobid), FALSE);

      rc = pbsd_init_reque(pjob, KEEP_STATE);

      break;

    default:

      sprintf(log_buf, msg_init_unkstate, pjob->ji_qs.ji_substate);

      log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

      job_abt(&pjob, log_buf);

      /* pjob is not freed */
      /* job_abt() receives &pjob; a NULL pjob afterwards ends recovery of
       * this job with PBSE_JOBSUBSTATE */
      if (pjob == NULL)
        {
        return(PBSE_JOBSUBSTATE);
        }

      break;
    }    /* END switch (pjob->ji_qs.ji_substate) */

  /* if job has IP address of Mom, it may have changed */
  /* reset based on hostname */
  if (pjob != NULL)
    {
    if ((pjob->ji_qs.ji_un_type == JOB_UNION_TYPE_EXEC) &&
        (pjob->ji_qs.ji_un.ji_exect.ji_momaddr != 0))
      {
      if (pjob->ji_wattr[JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET)
        {
        char *tmp;

        get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);

        /* with Cray support enabled and a login node id present, the
         * address is taken from the login node instead of exec_host */
        if ((cray_enabled == TRUE) &&
            (pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str != NULL))
          {
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_login_node_id].at_val.at_str, &d);
          }
        else
          tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &d);

        pjob->ji_qs.ji_un.ji_exect.ji_momaddr = get_hostaddr(&local_errno, tmp);

        /* the string returned by parse_servername() is released here */
        free(tmp);
        }
      else
        {
        pjob->ji_qs.ji_un.ji_exect.ji_momaddr = 0;
        }
      }
    }

  return(rc);
  }  /* END pbsd_init_job() */


/*
 * pbsd_init_reque - put a recovered job back into the queue it was in
 *
 * When change_state is non-zero the job state and substate are first
 * re-evaluated (svr_evaljobstate() / svr_setjobstate()); otherwise only
 * set_statechar() is called.  The job is then enqueued with svr_enquejob()
 * while server.sv_qs_mutex is held.
 *
 * If the enqueue fails with anything other than PBSE_JOB_RECYCLED or
 * PBSE_BADDEPEND, the failure is logged and the job is aborted with
 * job_abt(); the mutex is released around that call.
 *
 * Returns the result of svr_enquejob().
 */

int pbsd_init_reque(

  job *pjob,         /* I (modified/possibly freed) */
  int  change_state) /* I */

{
  int newstate;
  int newsubstate;
  int rc;
  char log_buf[LOCAL_LOG_BUF_SIZE];

  /* re-enqueue the job into the queue it was in */
  if (change_state)
    {
    /* update the state, typically to some form of QUEUED */
    svr_evaljobstate(*pjob, newstate, newsubstate, 0);

    svr_setjobstate(pjob, newstate, newsubstate, FALSE);
    }
  else
    {
    set_statechar(pjob);
    }

  /* log_buf doubles as the identifying string passed to the mutex calls */
  sprintf(log_buf, "%s:1", __func__);
  lock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

  if ((rc = svr_enquejob(pjob, TRUE, -1, false)) == PBSE_NONE)
    {
    int len;

    /* message = substate text followed by the queue name */
    snprintf(log_buf, sizeof(log_buf), msg_init_substate, pjob->ji_qs.ji_substate);
    len = strlen(log_buf);
    snprintf(log_buf + len, sizeof(log_buf) - len, "%s%s", msg_init_queued, pjob->ji_qs.ji_queue);

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buf);
    }
  else
    {
    /* Oops, this should never happen */
    if ((rc != PBSE_JOB_RECYCLED) &&
        (rc != PBSE_BADDEPEND))
      {
      snprintf(log_buf, sizeof(log_buf), "%s; job %s queue %s",
        msg_err_noqueue,
        pjob->ji_qs.ji_jobid,
        pjob->ji_qs.ji_queue);

      log_err(rc, __func__, log_buf);
      }

    /* the mutex is dropped while job_abt() runs and re-taken afterwards;
     * log_buf (the error text when one was built above) is also the text
     * handed to job_abt() */
    unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    if ((rc != PBSE_JOB_RECYCLED) &&
        (rc != PBSE_BADDEPEND))
      job_abt(&pjob, log_buf);

    lock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    /* NOTE:  pjob freed but dangling pointer remains */
    }

  snprintf(log_buf, sizeof(log_buf), "%s:1", __func__);
  unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

  return(rc);
  }  /* END pbsd_init_reque() */


/*
 * Catch core dump signals - set core size so we can
see what happened! * * Chris Samuel - VPAC * csamuel@vpac.org - 29th July 2003 */ void catch_abort( int sig) /* I */ { struct rlimit rlimit; struct sigaction act; /* * Reset ourselves to the default signal handler to try and * prevent recursive core dumps. */ sigemptyset(&act.sa_mask); act.sa_flags = 0; act.sa_handler = SIG_DFL; sigaction(SIGSEGV, &act, NULL); sigaction(SIGBUS, &act, NULL); sigaction(SIGFPE, &act, NULL); sigaction(SIGILL, &act, NULL); sigaction(SIGTRAP, &act, NULL); sigaction(SIGSYS, &act, NULL); log_err(sig, "mom_main", (char *)"Caught fatal core signal"); rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; setrlimit(RLIMIT_CORE, &rlimit); abort(); return; } /* END catch_abort() */ void change_logs_handler(int sig) { run_change_logs = TRUE; return; } /* * changs_logs - signal handler for SIGHUP * Causes the accounting file and log file to be closed and reopened. * Thus the old one can be renamed. */ void change_logs() { long record_job_info = FALSE; run_change_logs = FALSE; acct_close(); pthread_mutex_lock(&log_mutex); log_close(1); log_open(log_file, path_log); pthread_mutex_unlock(&log_mutex); acct_open(acct_file); get_svr_attr_l(SRV_ATR_RecordJobInfo, &record_job_info); if (record_job_info) { pthread_mutex_lock(&job_log_mutex); job_log_open(job_log_file, path_jobinfo_log); pthread_mutex_unlock(&job_log_mutex); } return; } /* * change_log_level - signal handler for SIGUSR! and SIGUSR2 * Increases log level if SIGUSR1 is received. * Decreases log level if SIGUSR2 is received. 
* Variable plogenv tells us whether or not PBSLOGLEVEL was specified * If it was not then we will update the server log level pbs_attribute * which allows qmgr to see the current log level value */ void change_log_level( int sig) { char log_buf[LOCAL_LOG_BUF_SIZE]; long level = 0; get_svr_attr_l(SRV_ATR_LogLevel, &level); if (sig == SIGUSR1) { /* increase log level */ if (plogenv == NULL) LOGLEVEL = level; LOGLEVEL = MIN(LOGLEVEL + 1, 7); if (plogenv == NULL) { set_svr_attr(SRV_ATR_LogLevel, &LOGLEVEL); } } else if (sig == SIGUSR2) { /* decrease log level */ if (plogenv == NULL) LOGLEVEL = level; LOGLEVEL = MAX(LOGLEVEL - 1, 0); if (plogenv == NULL) { set_svr_attr(SRV_ATR_LogLevel, &LOGLEVEL); } } sprintf(log_buf, "received signal %d: adjusting loglevel to %d", sig, LOGLEVEL); log_record( PBSEVENT_SYSTEM | PBSEVENT_FORCE, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf); return; } /* END change_log_level() */ /* * stop_me - signal handler for all caught signals which terminate the server * * Record the signal so an log_event call can be made outside of * the handler, and set the server state to indicate we should shut down. */ /*ARGSUSED*/ void stop_me( int sig) { long state = SV_STATE_SHUTSIG; set_svr_attr(SRV_ATR_State, &state); return; } int chk_save_file( char *filename) { struct stat sb; if (*filename == '.') { return(-1); } if (stat(filename, &sb) == -1) { return(errno); } if (S_ISREG(sb.st_mode)) { return(0); } return(-1); } /* * resume_net_move - call net_move() to complete the routing of a job * This is invoked via a work task created on recovery of a job * in JOB_SUBSTATE_TRNOUTCM state. 
*/ void resume_net_move( struct work_task *ptask) { char *jobid = (char *)ptask->wt_parm1; job *pjob; if (jobid != NULL) { if((pjob = svr_find_job(jobid, FALSE)) == NULL) return; mutex_mgr job_mgr(pjob->ji_mutex,true); net_move(pjob, 0); free(jobid); } free(ptask->wt_mutex); free(ptask); } /* END resume_net_move() */ /* * rm_files - on an RECOV_CREATE, remove all files under the specified * directory (path_priv) and any subdirectory except under "jobs". */ void rm_files( char *dirname) { DIR *dir; int i; struct stat stb; struct dirent *pdirt; char path[1024]; char log_buf[LOCAL_LOG_BUF_SIZE]; /* list of directories in which files are removed */ static const char *byebye[] = { "acl_groups", "acl_hosts", "acl_svr", "acl_users", "hostlist", "queues", NULL }; /* keep as last entry */ dir = opendir(dirname); if (dir != NULL) { while ((pdirt = readdir(dir)) != NULL) { snprintf(path, sizeof(path), "%s/%s", dirname, pdirt->d_name); if (stat(path, &stb) == 0) { if (S_ISDIR(stb.st_mode)) { for (i = 0; byebye[i]; ++i) { if (strcmp(pdirt->d_name, byebye[i]) == 0) { rm_files(path); } } } else if (unlink(path) == -1) { sprintf(log_buf, "cannot unlink %s", path); log_err(errno, "pbsd_init", log_buf); } } } closedir(dir); } return; } /* END rm_files() */ /* * init_abt_job() - log and email owner message that job is being aborted at * initialization; then purge job (must be called after job is enqueued. */ void init_abt_job( job *pjob) { log_event( PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, msg_init_abt); svr_mailowner(pjob, MAIL_ABORT, MAIL_NORMAL, msg_init_abt); svr_job_purge(pjob); return; } /* END init_abt_job() */ /* * This just reads in the server attributes from the server db. 
*/

/*
 * Nothing is read for a RECOV_CREATE start.  Otherwise path_priv and
 * path_svrdb are built when they are still unset, the resource definitions
 * are initialized when svr_resc_def is still NULL, and the server database
 * is checked and loaded.
 *
 * Returns 0 on success (or when type is RECOV_CREATE), -1 on failure.
 */

int recov_svr_attr(

  int type)  /* type of initialization   */

  {
  const char *suffix_slash = "/";
  int         rc;
  int         db_unusable;

  /* a create start has no saved database to read */
  if (type == RECOV_CREATE)
    return(0);

  /* Open the server database (save file) and read it in */
  if (path_priv == NULL)
    {
    path_priv = build_path(
      (char *)path_home,
      (char *)PBS_SVR_PRIVATE,
      (char *)suffix_slash);
    }

  if (path_svrdb == NULL)
    {
    path_svrdb = build_path(path_priv, (char *)PBS_SERVERDB, NULL);
    }

  if (svr_resc_def == NULL)
    {
    rc = init_resc_defs();

    if (rc != 0)
      {
      log_err(rc, "pbsd_init", msg_init_baddb);

      return(-1);
      }
    }

  /* the database is only read when the save file passes the check */
  rc = chk_save_file(path_svrdb);

  if (rc != 0)
    {
    db_unusable = TRUE;
    }
  else
    {
    rc = svr_recov_xml(path_svrdb, TRUE);
    db_unusable = (rc == -1);
    }

  if (db_unusable)
    {
    log_err(rc, __func__, msg_init_baddb);

    return(-1);
    }

  return(0);
  }  /* END recov_svr_attr() */