/* * OpenPBS (Portable Batch System) v2.3 Software License * * Copyright (c) 1999-2000 Veridian Information Solutions, Inc. * All rights reserved. * * --------------------------------------------------------------------------- * For a license to use or redistribute the OpenPBS software under conditions * other than those described below, or to purchase support for this software, * please contact Veridian Systems, PBS Products Department ("Licensor") at: * * www.OpenPBS.org +1 650 967-4675 sales@OpenPBS.org * 877 902-4PBS (US toll-free) * --------------------------------------------------------------------------- * * This license covers use of the OpenPBS v2.3 software (the "Software") at * your site or location, and, for certain users, redistribution of the * Software to other sites and locations. Use and redistribution of * OpenPBS v2.3 in source and binary forms, with or without modification, * are permitted provided that all of the following conditions are met. * After December 31, 2001, only conditions 3-6 must be met: * * 1. Commercial and/or non-commercial use of the Software is permitted * provided a current software registration is on file at www.OpenPBS.org. * If use of this software contributes to a publication, product, or * service, proper attribution must be given; see www.OpenPBS.org/credit.html * * 2. Redistribution in any form is only permitted for non-commercial, * non-profit purposes. There can be no charge for the Software or any * software incorporating the Software. Further, there can be no * expectation of revenue generated as a consequence of redistributing * the Software. * * 3. Any Redistribution of source code must retain the above copyright notice * and the acknowledgment contained in paragraph 6, this list of conditions * and the disclaimer contained in paragraph 7. * * 4. 
Any Redistribution in binary form must reproduce the above copyright * notice and the acknowledgment contained in paragraph 6, this list of * conditions and the disclaimer contained in paragraph 7 in the * documentation and/or other materials provided with the distribution. * * 5. Redistributions in any form must be accompanied by information on how to * obtain complete source code for the OpenPBS software and any * modifications and/or additions to the OpenPBS software. The source code * must either be included in the distribution or be available for no more * than the cost of distribution plus a nominal fee, and all modifications * and additions to the Software must be freely redistributable by any party * (including Licensor) without restriction. * * 6. All advertising materials mentioning features or use of the Software must * display the following acknowledgment: * * "This product includes software developed by NASA Ames Research Center, * Lawrence Livermore National Laboratory, and Veridian Information * Solutions, Inc. * Visit www.OpenPBS.org for OpenPBS software support, * products, and information." * * 7. DISCLAIMER OF WARRANTY * * THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT * ARE EXPRESSLY DISCLAIMED. * * IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE * U.S. 
GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * This license will be governed by the laws of the Commonwealth of Virginia, * without reference to its choice of law rules. */ #include /* the master config generated by configure */ #include #include #include #include #include #include #include #include #include #include "dis.h" #include "libpbs.h" #include "portability.h" #include #include #include #include #include #include "list_link.h" #include "server_limits.h" #include "attribute.h" #include "resource.h" #include "pbs_job.h" #include "log.h" #include "credential.h" #include "batch_request.h" #include "net_connect.h" #include "svrfunc.h" #include "mom_mach.h" #include "mom_func.h" #include "pbs_error.h" #include "pbs_proto.h" #include "rpp.h" #ifdef ENABLE_CPA #include "pbs_cpa.h" #endif #ifdef PENABLE_LINUX26_CPUSETS #include "pbs_cpuset.h" #endif /* External Functions */ /* External Globals */ extern char *path_epilog; extern char *path_epiloguser; extern char *path_epilogp; extern char *path_epiloguserp; extern char *path_jobs; extern unsigned int default_server_port; extern tlist_head svr_alljobs, mom_polljobs; extern int exiting_tasks; extern char *msg_daemonname; extern int termin_child; extern struct connection svr_conn[]; extern int resc_access_perm; extern char *path_aux; extern int LOGLEVEL; extern char *PJobSubState[]; extern char mom_host[]; extern int PBSNodeCheckProlog; extern int PBSNodeCheckEpilog; /* external prototypes */ u_long resc_used(job *, char *, u_long(*f) (resource *)); static void 
preobit_reply (int); static void obit_reply (int); extern int tm_reply (int, int, tm_event_t); extern u_long addclient (char *); extern void encode_used (job *, tlist_head *); extern void encode_flagged_attrs (job *, tlist_head *); extern void job_nodes (job *); extern int task_recov (job *); extern void mom_server_all_update_stat(void); extern void check_state(int); extern int mom_open_socket_to_jobs_server (job *, char *, void (*) (int)); extern int mark_for_resend (job *); extern void checkpoint_partial(job *pjob); extern void mom_checkpoint_recover(job *pjob); extern void clear_down_mom_servers(); extern int is_mom_server_down(pbs_net_t); extern void set_mom_server_down(pbs_net_t); extern int no_mom_servers_down(); extern char *get_local_script_path(job *pjob, char *base); /* END external prototypes */ /* * catch_child() - the signal handler for SIGCHLD. * * To keep the signal handler simple for * SIGCHLD - just indicate there was one. */ void catch_child( int sig) { termin_child = 1; return; } /* END catch_child() */ hnodent *get_node( job *pjob, tm_node_id nodeid) { int i; vnodent *vp = pjob->ji_vnods; for (i = 0;i < pjob->ji_numvnod;i++, vp++) { if (vp->vn_node == nodeid) { return(vp->vn_host); } } return(NULL); } /* END get_node() */ /** * For all jobs in MOM * ignore job if job's pbs_server is down * for all tasks in job * ignore task if task state is not exiting * if task is master, send kill to all sisters * process TM client obits * if I am sister, do sister stuff and continue * kill_job * contact server and register preobit_reply() * set job substate to JOB_SUBSTATE_PREOBIT * * @see main_loop() - parent * @see scan_for_terminated() * @see post_epilog() * @see preobit_reply() - registered to handle response to preobit * @see send_sisters() - child * @see kill_job() - child * * Obit Overview: * - main_loop() * - scan_for_terminated() * uses waitpid() to detect completed children * First Pass: catches SIGCHLD of job executable to identify when job * tasks 
terminate, issues kill_task(), and marks job task ti_status * as TI_STATE_EXITED which is detected and processed inside of * scan_for_exiting() * Second Pass: catches SIGCHLD for job epilog child and exec's * job's ji_mompost (post_epilog) * * - scan_for_exiting() * called after scan_for_terminated and looks at jobs to identify which * have exiting tasks. Sends kill to all sisters via send_sisters(), * sets job substate to JOB_SUBSTATE_EXITING, issues kill_job, and * then sets job substate to JOB_SUBSTATE_PREOBIT. This routine then * creates the preobit message and sends it to pbs_server. * registers preobit_reply() as socket handler * * - preobit_reply() * o validates server response to preobit message * If the server returns unknown job id (it may have been purged), * then the job is deleted from the mom: mom_deljob -> job_purge, * and that should be it for the job. Otherwise, we fork: * - fork_me() * o parent registers post_epilog in job ji_mompost attribute, sets job * substate to JOB_SUBSTATE_OBIT, and registers post_epilogue handler. * This handler will be invoked when the waitpid in scan_for_terminated * catches a SIGCHLD for the job epilog invoked by the child. 
 * o child runs run_pelog()
 *
 * - post_epilog()
 * sends obit to pbs_server and registers obit_reply() as connection handler
 *
 * - obit_reply()
 * sets job substate to EXITED
 * END OF JOB LIFECYCLE
 *
 * when job completes and process id goes away scan_for_terminated()
 *
 * OVERALL FLOW:
 * - scan_for_terminating() - PHASE I
 * - KILL TASK
 * - scan_for_exiting()
 * - KILL SISTERS
 * - SEND PREOBIT TO PBS_SERVER
 * - preobit_reply() - FORK AND EXEC EPILOG
 * - scan_for_terminating() - PHASE II
 * - post_epilog()
 * - SEND OBIT TO PBS_SERVER
 * - obit_reply()
 *
 * STATE TRANSITIONS:
 * JOB_SUBSTATE_RUNNING  (42)
 * JOB_SUBSTATE_EXITING  (50) - scan_for_exiting()
 * JOB_SUBSTATE_PREOBIT  (57) - scan_for_exiting()
 * JOB_SUBSTATE_OBIT     (58) - preobit_reply()
 */

void scan_for_exiting(void)
  {
  char *id = "scan_for_exiting";

  int  found_one = 0;
  job *nxjob;
  job *pjob;
  task *ptask;
  obitent *pobit;
  int  sock;
  char *cookie;

  /* local prototypes for helpers defined elsewhere in mom */

  u_long gettime(resource *);
  u_long getsize(resource *);
  task *task_find(job *, tm_task_id);
  int im_compose(int, char *, char *, int, tm_event_t, tm_task_id);

  static int ForceObit    = -1;  /* boolean - if TRUE, ObitsAllowed will be enforced */
  static int ObitsAllowed = 1;   /* max obits sent per pass (site-tunable, see below) */

  int NumSisters;

  /*
  ** Look through the jobs.  Each one has its tasks examined
  ** and if the job is EXITING, it meets its fate depending
  ** on whether this is the Mother Superior or not.
  */

  if (LOGLEVEL >= 3)
    {
    log_record(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_SERVER,
      id,
      "searching for exiting jobs");
    }

  if (ForceObit == -1)
    {
    /* NOTE:  Allow sites to locally specify obit groupings larger than 1. */
    /*        Remove after 6/1/2008 if no further obit issues are encountered */

    char *ptr;

    if ((ptr = getenv("TORQUEFORCESEND")) != NULL)
      {
      int tmpI;

      tmpI = (int)strtol(ptr, NULL, 10);

      if (tmpI > 0)
        ObitsAllowed = tmpI;

      ForceObit = 1;
      }
    else
      {
      ForceObit = 1;
      }
    }  /* END if (ForceObit == -1) */

  clear_down_mom_servers();

  for (pjob = (job *)GET_NEXT(svr_alljobs);pjob != NULL;pjob = nxjob)
    {
    /* fetch next link first: pjob may be purged inside this iteration */

    nxjob = (job *)GET_NEXT(pjob->ji_alljobs);

    /*
     * Bypass job if it is for a server that we know is down
     */

    if (is_mom_server_down(pjob->ji_qs.ji_un.ji_momt.ji_svraddr))
      {
      if (LOGLEVEL >= 3)
        {
        snprintf(log_buffer, 1024, "not checking job %s - server is down",
                 pjob->ji_qs.ji_jobid);

        log_record(
          PBSEVENT_DEBUG,
          PBS_EVENTCLASS_SERVER,
          id,
          log_buffer);
        }

      continue;
      }

    /*
    ** If a checkpoint with aborts is active,
    ** skip it.  We don't want to report any obits
    ** until we know that the whole thing worked.
    */

    if (pjob->ji_flags & MOM_CHECKPOINT_ACTIVE)
      {
      continue;
      }

    /*
    ** If the job has had an error doing a checkpoint with
    ** abort, the MOM_CHECKPOINT_POST flag will be on.
    */

    if (pjob->ji_flags & MOM_CHECKPOINT_POST)
      {
      checkpoint_partial(pjob);

      continue;
      }

    /* a job without its TM cookie cannot be processed here */

    if (!(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags & ATR_VFLAG_SET))
      {
      continue;
      }

    cookie = pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str;

    /*
    ** Check each EXITED task.  They transition to DEAD here.
    */

    for (
      ptask = (task *)GET_NEXT(pjob->ji_tasks);
      ptask != NULL;
      ptask = (task *)GET_NEXT(ptask->ti_jobtask))
      {
      if (ptask->ti_qs.ti_status != TI_STATE_EXITED)
        continue;

      /*
      ** Check if it is the top shell.
      */

      if (ptask->ti_qs.ti_parenttask == TM_NULL_TASK)
        {
        /* master task is in state TI_STATE_EXITED */

        /* do not overwrite an over-limit exit status already recorded */

        if (pjob->ji_qs.ji_un.ji_momt.ji_exitstat != JOB_EXEC_OVERLIMIT)
          {
          pjob->ji_qs.ji_un.ji_momt.ji_exitstat = ptask->ti_qs.ti_exitstat;
          }

        LOG_EVENT(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "job was terminated");

        NumSisters = send_sisters(pjob, IM_KILL_JOB);

        if (NumSisters == 0)
          {
          /* no sisters contacted - should be a serial job */

          if (LOGLEVEL >= 3)
            {
            LOG_EVENT(
              PBSEVENT_JOB,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              "no sisters contacted - setting job substate to EXITING");
            }

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;

          job_save(pjob, SAVEJOB_QUICK);
          }
        else if (LOGLEVEL >= 3)
          {
          snprintf(log_buffer, 1024,
            "master task has exited - sent kill job request to %d sisters",
            NumSisters);

          LOG_EVENT(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            log_buffer);
          }
        }  /* END if (ptask->ti_qs.ti_parenttask == TM_NULL_TASK) */

      /*
      ** process any TM client obits waiting.
      */

      pobit = (obitent *)GET_NEXT(ptask->ti_obits);

      while (pobit != NULL)
        {
        hnodent *pnode;

        pnode = get_node(pjob, pobit->oe_info.fe_node);

        /* see if this is me or another MOM */

        /* NOTE(review): get_node() can return NULL for an unknown node id;
         * pnode is dereferenced below without a check - verify fe_node is
         * always a valid vnode of this job */

        if (pjob->ji_nodeid == pnode->hn_node)
          {
          task *tp;

          /* send event to local child */

          tp = task_find(pjob, pobit->oe_info.fe_taskid);

          assert(tp != NULL);

          if (tp->ti_fd != -1)
            {
            tm_reply(tp->ti_fd, IM_ALL_OKAY, pobit->oe_info.fe_event);

            diswsi(tp->ti_fd, ptask->ti_qs.ti_exitstat);

            DIS_tcp_wflush(tp->ti_fd);
            }
          }
        else if (pnode->hn_stream != -1)
          {
          /*
          ** Send a response over to MOM
          ** whose child sent the request.
          */

          im_compose(
            pnode->hn_stream,
            pjob->ji_qs.ji_jobid,
            cookie,
            IM_ALL_OKAY,
            pobit->oe_info.fe_event,
            pobit->oe_info.fe_taskid);

          diswsi(pnode->hn_stream, ptask->ti_qs.ti_exitstat);

          rpp_flush(pnode->hn_stream);
          }

        delete_link(&pobit->oe_next);

        free(pobit);

        pobit = (obitent *)GET_NEXT(ptask->ti_obits);
        }  /* END while (pobit) */

      ptask->ti_fd = -1;

      ptask->ti_qs.ti_status = TI_STATE_DEAD;

      if (LOGLEVEL >= 3)
        {
        LOG_EVENT(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "task is dead");
        }

      task_save(ptask);
      }  /* END for (ptask) */

    /*
    ** Look to see if the job has terminated.  If it is
    ** in any state other than EXITING continue on.
    */

    if ((pjob->ji_qs.ji_substate != JOB_SUBSTATE_EXITING) &&
        (pjob->ji_qs.ji_substate != JOB_SUBSTATE_NOTERM_REQUE))
      {
      if (LOGLEVEL >= 3)
        {
        snprintf(log_buffer, 1024,
          "job is in non-exiting substate %s, no obit sent at this time",
          PJobSubState[pjob->ji_qs.ji_substate]);

        LOG_EVENT(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buffer);
        }

      if (pjob->ji_qs.ji_substate == JOB_SUBSTATE_EXITED)
        {
        /* This is quasi odd. If we are in an EXITED substate
           then we already sent the obit to the server and
           it replied. But we have not received a
           PBS_BATCH_DeleteJob request from the server.
           If we have tasks to complete continue. But if
           there are no tasks left to run we need to delete
           the job.*/

        ptask = (task *)GET_NEXT(pjob->ji_tasks);

        if (ptask == NULL)
          mom_deljob(pjob);
        }

      continue;
      }

    /*
    ** Look to see if I am a regular sister.  If so,
    ** check to see if there is an obit event to
    ** send back to mother superior.
    ** Otherwise, I need to wait for her to send a KILL_JOB
    ** so I can send the obit (unless she died).
    */

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0)
      {
      int stream;

      stream = (pjob->ji_hosts == NULL) ?
               -1 :
               pjob->ji_hosts[0].hn_stream;

      /*
      ** Check to see if I'm still in touch with
      ** the mother superior.  If not, I'm just going to
      ** get rid of this job.
      */

      if (stream == -1)
        {
        if (LOGLEVEL >= 3)
          {
          LOG_EVENT(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            "connection to server lost - no obit sent - job will be purged");
          }

        if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_NOTERM_REQUE)
          {
          kill_job(pjob, SIGKILL, id, "connection to server lost - no obit sent");
          }

        job_purge(pjob);

        continue;
        }

      /*
      ** No event waiting for sending info to MS
      ** so I'll just sit tight.
      */

      if (pjob->ji_obit == TM_NULL_EVENT)
        {
        if (LOGLEVEL >= 3)
          {
          LOG_EVENT(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            "obit method not specified for job - no obit sent");
          }

        continue;
        }

      /*
      ** Check to see if any tasks are running.
      */

      ptask = (task *)GET_NEXT(pjob->ji_tasks);

      while (ptask != NULL)
        {
        if (ptask->ti_qs.ti_status == TI_STATE_RUNNING)
          break;

        ptask = (task *)GET_NEXT(ptask->ti_jobtask);
        }

      /* Still somebody there so don't send it yet. */

      if (ptask != NULL)
        {
        if (LOGLEVEL >= 3)
          {
          LOG_EVENT(
            PBSEVENT_JOB,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            "one or more running tasks found - no obit sent");
          }

        continue;
        }

      /* run parallel epilogs; interactive jobs suppress epilog I/O */

      if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
          pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long)
        {
        if (run_pelog(PE_EPILOGUSER, path_epiloguserp, pjob, PE_IO_TYPE_NULL) != 0)
          {
          log_err(-1, id, "user parallel epilog failed");
          }

        if (run_pelog(PE_EPILOG, path_epilogp, pjob, PE_IO_TYPE_NULL) != 0)
          {
          log_err(-1, id, "parallel epilog failed");
          }
        }
      else
        {
        if (run_pelog(PE_EPILOGUSER, path_epiloguserp, pjob, PE_IO_TYPE_STD) != 0)
          {
          log_err(-1, id, "parallel user epilog failed");
          }

        if (run_pelog(PE_EPILOG, path_epilogp, pjob, PE_IO_TYPE_STD) != 0)
          {
          log_err(-1, id, "parallel epilog failed");
          }
        }

      /*
      ** No tasks running ... format and send a
      ** reply to the mother superior and get rid of
      ** the job.
      */

      im_compose(
        stream,
        pjob->ji_qs.ji_jobid,
        cookie,
        IM_ALL_OKAY,
        pjob->ji_obit,
        TM_NULL_TASK);

      diswul(stream, resc_used(pjob, "cput", gettime));

      diswul(stream, resc_used(pjob, "mem", getsize));

      diswul(stream, resc_used(pjob, "vmem", getsize));

      rpp_flush(stream);

      if (LOGLEVEL >= 6)
        {
        LOG_EVENT(
          PBSEVENT_JOB,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          "all tasks complete - purging job as sister");
        }

      DBPRT(("all tasks complete - purging job as sister (%s)\n",
             pjob->ji_qs.ji_jobid));

      job_purge(pjob);

      continue;
      }  /* END if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0) */

    /*
     * At this point, we know we are Mother Superior for this
     * job which is EXITING.  Time for it to die.
     */

    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_NOTERM_REQUE)
      kill_job(pjob, SIGKILL, id, "local task termination detected");
    else
      {
      /* requeue without termination: mark running tasks EXITED instead */

      ptask = (task *)GET_NEXT(pjob->ji_tasks);

      while (ptask != NULL)
        {
        if (ptask->ti_qs.ti_status == TI_STATE_RUNNING)
          {
          if (LOGLEVEL >= 4)
            {
            log_record(
              PBSEVENT_JOB,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              "kill_job found a task to kill");
            }

          if (pjob->ji_qs.ji_un.ji_momt.ji_exitstat != 0)
            ptask->ti_qs.ti_exitstat = pjob->ji_qs.ji_un.ji_momt.ji_exitstat;
          else
            ptask->ti_qs.ti_exitstat = 0;  /* assume successful completion */

          ptask->ti_qs.ti_status = TI_STATE_EXITED;

          task_save(ptask);
          }

        ptask = (task *)GET_NEXT(ptask->ti_jobtask);
        }  /* END while (ptask != NULL) */
      }

#ifdef ENABLE_CPA
    if (CPADestroyPartition(pjob) != 0)
      continue;

#endif

    delete_link(&pjob->ji_jobque); /* unlink for poll list */

    /*
     * + Open connection to the Server (for the Job Obituary)
     * + Set the connection to call obit_reply when the reply
     *   arrives.
     * + fork child process, parent looks for more terminated jobs.
     * Child:
     * + Run the epilogue script (if one)
     * + Send the Job Obit Request (notice).
     */

    sock = mom_open_socket_to_jobs_server(pjob, id, preobit_reply);

    if (sock < 0)
      {
      if ((errno == EINPROGRESS) ||
          (errno == ETIMEDOUT) ||
          (errno == EINTR))
        {
        /* NOTE(review): log_buffer is composed here but never passed to
         * log_record/log_err, so this message is silently dropped */

        sprintf(log_buffer, "connect to server unsuccessful after 5 seconds - will retry");
        }

      /*
       * continue through the jobs loop since we can have jobs for multiple
       * servers.  Keep track that this server is down so we don't try to
       * process any more jobs for it.  We will leave it's exiting_tasks set
       * so Mom will retry Obit when server is available
       */

      set_mom_server_down(pjob->ji_qs.ji_un.ji_momt.ji_svraddr);

      continue;
      }  /* END if (sock < 0) */

    if (LOGLEVEL >= 2)
      {
      log_record(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "sending preobit jobstat");
      }

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_PREOBIT;

#ifdef TREMOVEME
    if (ForceObit == 0)
      {
      if (found_one++ >= ObitsAllowed)
        {
        /* do not exceed max obits per iteration limit */

        break;
        }
      }

#endif /* TREMOVEME */

    /* send the pre-obit job stat request */

    DIS_tcp_setup(sock);

    if (encode_DIS_ReqHdr(sock, PBS_BATCH_StatusJob, pbs_current_user) ||
        encode_DIS_Status(sock, pjob->ji_qs.ji_jobid, NULL) ||
        encode_DIS_ReqExtend(sock, NULL))
      {
      /* FAILURE */

      log_record(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "failed creating preobit message");

      return;
      }

    DIS_tcp_wflush(sock);

    if (found_one++ >= ObitsAllowed)
      {
      /* do not exceed max obits per iteration limit */

      break;
      }
    }  /* END for (pjob) */

  if ((pjob == NULL) && (no_mom_servers_down()))
    {
    /* search finished */

    exiting_tasks = 0; /* went through all jobs */
    }

  return;
  }  /* END scan_for_exiting() */


/**
 * Send obit to server.
* * @see scan_for_terminated() - calls post_epilog() via ji_mompost job attribute * @see mom_open_socket_to_jobs_server() - child * @see obit_reply() - registered handler for obit connection * * @see scan_for_exiting() for Obit overview */ int post_epilogue( job *pjob, /* I */ int ev) /* I exit value (only used to determine if retrying obit) */ { char id[] = "post_epilogue"; int sock; struct batch_request *preq; if (LOGLEVEL >= 2) { sprintf(log_buffer, "preparing obit message for job %s", pjob->ji_qs.ji_jobid); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, id, log_buffer); } /* open new connection - register obit_reply as handler */ sock = mom_open_socket_to_jobs_server(pjob, id, obit_reply); if (sock < 0) { /* FAILURE */ if ((errno == EINTR) || (errno == ETIMEDOUT) || (errno == EINPROGRESS)) { /* transient failure - server/network up but busy... retry */ int retrycount; for (retrycount = 0;retrycount < 2;retrycount++) { sock = mom_open_socket_to_jobs_server(pjob, id, obit_reply); if (sock >= 0) break; } /* END for (retrycount) */ } if (sock < 0) { /* We are trying to send obit, but failed - where is this retried? * Answer: In the main_loop examine_all_jobs_to_resend() tries * every so often to send the obit. This would work for recovered * jobs also. 
*/ if (ev != MOM_OBIT_RETRY) { mark_for_resend(pjob); } return(1); } } /* send the job obiturary notice to the server */ preq = alloc_br(PBS_BATCH_JobObit); if (preq == NULL) { /* FAILURE */ sprintf(log_buffer, "cannot allocate memory for obit message"); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, id, log_buffer); return(1); } CLEAR_HEAD(preq->rq_ind.rq_jobobit.rq_attr); resc_access_perm = ATR_DFLAG_RDACC; encode_used(pjob, &preq->rq_ind.rq_jobobit.rq_attr); encode_flagged_attrs(pjob, &preq->rq_ind.rq_jobobit.rq_attr); strcpy(preq->rq_ind.rq_jobobit.rq_jid, pjob->ji_qs.ji_jobid); if (pjob->ji_job_is_being_rerun) { pjob->ji_qs.ji_un.ji_momt.ji_exitstat = 0; } preq->rq_ind.rq_jobobit.rq_status = pjob->ji_qs.ji_un.ji_momt.ji_exitstat; if (LOGLEVEL > 5) { sprintf(log_buffer, "job id %s exit status %d", preq->rq_ind.rq_jobobit.rq_jid, preq->rq_ind.rq_jobobit.rq_status); LOG_EVENT( PBSEVENT_DEBUG, PBSEVENT_JOB, id, log_buffer); } DIS_tcp_setup(sock); if (encode_DIS_ReqHdr(sock, PBS_BATCH_JobObit, pbs_current_user) || encode_DIS_JobObit(sock, preq) || encode_DIS_ReqExtend(sock, 0)) { /* FAILURE */ sprintf(log_buffer, "cannot create obit message for job %s", pjob->ji_qs.ji_jobid); LOG_EVENT( PBSEVENT_DEBUG, PBS_EVENTCLASS_REQUEST, id, log_buffer); close(sock); free_br(preq); return(1); } DIS_tcp_wflush(sock); /* does flush close sock? */ free_br(preq); /* SUCCESS */ /* Who closes sock and unsets pjob->ji_momhandle? * Answer: This gets done in the message reply handler, obit_reply. */ log_record( PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, "obit sent to server"); return(0); } /* END post_epilog() */ /** * preobit_reply * * @see scan_for_exiting() - registers this routine as handler * @see mom_deljob() - child * @see run_pelog() - child * * This function is a message handler that is hooked to a server connection. * The connection is established in scan_for_exiting() where all jobs * are examined. 
A socket connection to the server is opened, an obit
 * message is sent to the server, and then at some later time, the server
 * sends back a reply and we end up here.
 *
 * What is the correct response if an EOF is detected?
 */

static void preobit_reply(

  int sock)  /* I */

  {
  char id[] = "preobit_reply";

  pid_t cpid;
  job  *pjob;
  int   irtn;

  struct batch_request *preq;

  struct brp_status *pstatus;
  svrattrl *sattrl;

  int runepilogue  = 0;  /* set when this host ran the job and must run epilogs */
  int deletejob    = 0;  /* set when the server no longer knows the job */
  int jobiscorrupt = 0;  /* set when the stat reply lacks an exec_host attr */

  char *path_epiloguserjob;
  resource *presc;

  /* struct batch_status *bsp = NULL; */

  log_record(
    PBSEVENT_DEBUG,
    PBS_EVENTCLASS_SERVER,
    id,
    "top of preobit_reply");

  /* read and decode the reply */

  /* NOTE(review): alloc_br() result is used without a NULL check here,
   * unlike post_epilogue() - confirm alloc_br cannot fail or add a check */

  preq = alloc_br(PBS_BATCH_StatusJob);

  CLEAR_HEAD(preq->rq_ind.rq_status.rq_attr);

  /* retry the read as long as it is interrupted by a signal */

  while ((irtn = DIS_reply_read(sock, &preq->rq_reply)) &&
         (errno == EINTR));

  if (irtn != 0)
    {
    sprintf(log_buffer, "DIS_reply_read/decode_DIS_replySvr failed, rc=%d sock=%d",
            irtn,
            sock);

    /* NOTE:  irtn=11 indicates EOF */

    /* NOTE:  errno not set, thus log_err say success in spite of failure */

    log_err(errno, id, log_buffer);

    preq->rq_reply.brp_code = -1;
    }
  else
    {
    log_record(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_SERVER,
      id,
      "DIS_reply_read/decode_DIS_replySvr worked, top of while loop");
    }

  /* find the job that triggered this req */

  pjob = (job *)GET_NEXT(svr_alljobs);

  while (pjob != NULL)
    {
    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_PREOBIT) &&
        (pjob->ji_momhandle == sock))
      {
      /* located job that triggered req from server */

      break;
      }

    pjob = (job *)GET_NEXT(pjob->ji_alljobs);
    }  /* END while (pjob != NULL) */

  if (pjob == NULL)
    {
    /* FAILURE - cannot locate job that triggered req */

    log_record(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_SERVER,
      id,
      "cannot locate job that triggered req");

    free_br(preq);

    shutdown(sock, SHUT_RDWR);

    close_conn(sock);

    return;
    }  /* END if (pjob != NULL) */

  /* we've got a job in PREOBIT and matches the socket, now
     inspect the results of the job stat */

  switch (preq->rq_reply.brp_code)
    {

    case PBSE_CLEANEDOUT:

    case PBSE_UNKJOBID:

      /* this is the simple case of the job being purged from the server */

      sprintf(log_buffer, "preobit_reply, unknown on server, deleting locally");

      deletejob = 1;

      break;  /* not reached */

    case PBSE_NONE:

      log_record(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_SERVER,
        id,
        "in while loop, no error from job stat");

      if (preq->rq_reply.brp_choice == BATCH_REPLY_CHOICE_Status)
        {
        pstatus = (struct brp_status *)GET_NEXT(preq->rq_reply.brp_un.brp_status);
        }
      else
        {
        sprintf(log_buffer, "BUG: preq->rq_reply.brp_choice==%d",
                preq->rq_reply.brp_choice);

        break;
        }

      if (pstatus == NULL)
        {
        sprintf(log_buffer, "BUG: pstatus==NULL");

        break;
        }

      if (strcmp(pstatus->brp_objname, pjob->ji_qs.ji_jobid))
        {
        sprintf(log_buffer, "BUG: mismatched jobid in preobit_reply (%s != %s)",
                pstatus->brp_objname,
                pjob->ji_qs.ji_jobid);

        break;
        }

      /* determine if job has exechost set - if set, and task 0 host is X ... */

      sattrl = (svrattrl *)GET_NEXT(pstatus->brp_attr);

      jobiscorrupt = 1;

      while (sattrl != NULL)
        {
        if (!strcmp(sattrl->al_name, ATTR_exechost))
          {
          jobiscorrupt = 0;

          /* compare the first host in exec_host with my host entry */

          if (strncmp(
                sattrl->al_value,
                pjob->ji_hosts[0].hn_host,
                strlen(pjob->ji_hosts[0].hn_host)))
            {
            /* the job was re-run elsewhere */

            sprintf(log_buffer, "first host DOES NOT match me: %s != %s",
                    sattrl->al_value,
                    pjob->ji_hosts[0].hn_host);

            deletejob = 1;
            }
          else
            {
            /* job was run locally */

            runepilogue = 1;
            }

          break;
          }

        sattrl = (svrattrl *)GET_NEXT(sattrl->al_link);
        }  /* END while (sattrl != NULL) */

      if (jobiscorrupt == 1)
        {
        /* runepilogue = 1; */
        }

      break;

    case - 1:

      sprintf(log_buffer, "EOF? received attempting to process obit reply");

      break;

    default:

      /* not sure what happened */

      sprintf(log_buffer, "something bad happened: %d",
              preq->rq_reply.brp_code);

      break;
    }  /* END switch (preq->rq_reply.brp_code) */

  /* we've inspected the server's response and can now act */

  free_br(preq);

  shutdown(sock, SHUT_RDWR);

  close_conn(sock);

  if (deletejob == 1)
    {
    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    if (!(pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) ||
        (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long == 0))
      {
      int x; /* dummy */

      /* do this if not interactive */

      job_unlink_file(pjob, std_file_name(pjob, StdOut, &x));
      job_unlink_file(pjob, std_file_name(pjob, StdErr, &x));
      job_unlink_file(pjob, std_file_name(pjob, Checkpoint, &x));
      }

    mom_deljob(pjob);

    return;
    }

  if (!runepilogue)
    {
    /* could not confirm this host ran the job - back to EXITING so the
       whole obit sequence is retried on a later pass */

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;

    pjob->ji_momhandle = -1;

    exiting_tasks = 1;  /* job exit will be picked up again */

    return;
    }

  /* at this point, server gave us a valid response so we can run epilogue */

  if (LOGLEVEL >= 2)
    {
    log_record(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "performing job clean-up in preobit_reply()");
    }

  cpid = fork_me(-1);

  if (cpid < 0)
    {
    /* FAILURE */

    log_record(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "fork failed in preobit_reply");

    return;
    }

  if (cpid > 0)
    {
    /* parent - mark that job epilog subtask has been launched */

    /* NOTE:  pjob->ji_mompost will be executed in scan_for_terminated() */

    pjob->ji_qs.ji_substate = JOB_SUBSTATE_OBIT;

    pjob->ji_momsubt = cpid;
    pjob->ji_mompost = post_epilogue;

    pjob->ji_momhandle = -1;

    if (LOGLEVEL >= 2)
      {
      snprintf(log_buffer, 1024,
        "epilog subtask created with pid %d - substate set to JOB_SUBSTATE_OBIT - registered post_epilogue",
        cpid);

      log_record(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }

    return;
    }

  /* child */

  /* check epilog script */

  if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long)
    {
    /* job is interactive - run epilogs with I/O suppressed */

    presc = find_resc_entry(
              &pjob->ji_wattr[(int)JOB_ATR_resource],
              find_resc_def(svr_resc_def, "epilogue", svr_resc_size));

    if ((presc != NULL))
      if ((presc->rs_value.at_flags & ATR_VFLAG_SET) && (presc->rs_value.at_val.at_str != NULL))
        {
        path_epiloguserjob = get_local_script_path(pjob, presc->rs_value.at_val.at_str);

        if (path_epiloguserjob)
          {
          if (run_pelog(PE_EPILOGUSERJOB, path_epiloguserjob, pjob, PE_IO_TYPE_NULL) != 0)
            {
            log_err(-1, id, "user local epilog failed");
            }

          free(path_epiloguserjob);
          }
        }

    if (run_pelog(PE_EPILOGUSER, path_epiloguser, pjob, PE_IO_TYPE_NULL) != 0)
      {
      log_err(-1, id, "user epilog failed - interactive job");
      }

    if (run_pelog(PE_EPILOG, path_epilog, pjob, PE_IO_TYPE_NULL) != 0)
      {
      log_err(-1, id, "system epilog failed - interactive job");
      }
    }
  else
    {
    /* job is not interactive - epilogs use the job's std streams */

    int rc;

    presc = find_resc_entry(
              &pjob->ji_wattr[(int)JOB_ATR_resource],
              find_resc_def(svr_resc_def, "epilogue", svr_resc_size));

    if ((presc != NULL))
      if ((presc->rs_value.at_flags & ATR_VFLAG_SET) && (presc->rs_value.at_val.at_str != NULL))
        {
        path_epiloguserjob = get_local_script_path(pjob, presc->rs_value.at_val.at_str);

        if (path_epiloguserjob)
          {
          if (run_pelog(PE_EPILOGUSERJOB, path_epiloguserjob, pjob, PE_IO_TYPE_STD) != 0)
            {
            log_err(-1, id, "user local epilog failed");
            }

          free(path_epiloguserjob);
          }
        }

    if (run_pelog(PE_EPILOGUSER, path_epiloguser, pjob, PE_IO_TYPE_STD) != 0)
      {
      log_err(-1, id, "user epilog failed");
      }

    if ((rc = run_pelog(PE_EPILOG, path_epilog, pjob, PE_IO_TYPE_STD)) != 0)
      {
      sprintf(log_buffer, "system epilog failed w/rc=%d",
              rc);

      log_err(-1, id, log_buffer);
      }
    }  /* END else (jobisinteractive) */

  exit(0);
  }  /* END preobit_reply() */


/*
 * obit_reply
 *
 * This function is a message handler that is hooked to a server connection.
 * The connection is established in post_epilogue().
 *
 * A socket connection to the server is opened, a job obituary notice
 * message is sent to the server, and then at some later time, the server
 * sends back a reply and we end up here.
 *
 * On success, this routine sets the job's substate to EXITED
 *
 * @see post_epilogue() - registers obit_reply via add_conn()
 */

static void obit_reply(

  int sock)  /* I */

  {
  int irtn;
  job *nxjob;
  job *pjob;
  attribute *pattr;

  struct batch_request *preq;

  int x; /* dummy */

  /* read and decode the reply */

  /* NOTE(review): alloc_br() result is used without a NULL check here,
   * unlike post_epilogue() - confirm alloc_br cannot fail or add a check */

  preq = alloc_br(PBS_BATCH_JobObit);

  CLEAR_HEAD(preq->rq_ind.rq_jobobit.rq_attr);

  /* retry the read as long as it is interrupted by a signal */

  while ((irtn = DIS_reply_read(sock, &preq->rq_reply)) &&
         (errno == EINTR));

  if (irtn != 0)
    {
    /* NOTE:  irtn is of type DIS_* in include/dis.h, see dis_emsg[] */

    sprintf(log_buffer, "DIS_reply_read failed, rc=%d sock=%d",
            irtn,
            sock);

    log_err(errno, "obit_reply", log_buffer);

    preq->rq_reply.brp_code = -1;
    }

  /* find the job associated with the reply by the socket number */
  /* saved in the job structure, ji_momhandle */

  pjob = (job *)GET_NEXT(svr_alljobs);

  while (pjob != NULL)
    {
    nxjob = (job *)GET_NEXT(pjob->ji_alljobs);

    if ((pjob->ji_qs.ji_substate == JOB_SUBSTATE_OBIT) &&
        (pjob->ji_momhandle == sock))
      {
      /* Clear out destination so we know job is not on mom any more */

      pjob->ji_qs.ji_destin[0] = '\0';

      switch (preq->rq_reply.brp_code)
        {

        case PBSE_NONE:

          /* normal ack, mark job as exited */

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITED;

          job_save(pjob, SAVEJOB_QUICK);

          if (LOGLEVEL >= 4)
            {
            LOG_EVENT(
              PBSEVENT_ERROR,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              "job obit acknowledge received - substate set to JOB_SUBSTATE_EXITED");
            }

          break;

        case PBSE_ALRDYEXIT:

          /* have already told the server before recovery */
          /* the server will contact us to continue */

          if (LOGLEVEL >= 7)
            {
            log_record(
              PBSEVENT_ERROR,
              PBS_EVENTCLASS_JOB,
              pjob->ji_qs.ji_jobid,
              "setting already exited job substate to EXITED");
            }

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITED;

          job_save(pjob, SAVEJOB_QUICK);

          break;

        case PBSE_CLEANEDOUT:

          /* all jobs discarded by server, discard job */

          pattr = &pjob->ji_wattr[(int)JOB_ATR_interactive];

          if (((pattr->at_flags & ATR_VFLAG_SET) == 0) ||
              (pattr->at_val.at_long == 0))
            {
            /* do this if not interactive */

            job_unlink_file(pjob, std_file_name(pjob, StdOut, &x));
            job_unlink_file(pjob, std_file_name(pjob, StdErr, &x));
            job_unlink_file(pjob, std_file_name(pjob, Checkpoint, &x));
            }

          mom_deljob(pjob);

          break;

        case - 1:

          /* FIXME - causes epilogue to be run twice! */

          pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;

          exiting_tasks = 1;

          break;

        default:

          {
          char tmpLine[1024];

          switch (preq->rq_reply.brp_code)
            {

            case PBSE_BADSTATE:

              sprintf(tmpLine, "server rejected job obit - unexpected job state");

              break;

            case PBSE_SYSTEM:

              sprintf(tmpLine, "server rejected job obit - server not ready for job completion");

              break;

            default:

              sprintf(tmpLine, "server rejected job obit - %d",
                      preq->rq_reply.brp_code);

              break;
            }  /* END switch (preq->rq_reply.brp_code) */

          log_ext(-1,"obit_reply",tmpLine,LOG_ALERT);

          LOG_EVENT(
            PBSEVENT_ERROR,
            PBS_EVENTCLASS_JOB,
            pjob->ji_qs.ji_jobid,
            tmpLine);
          }  /* END BLOCK */

          mom_deljob(pjob);

          break;
        }  /* END switch (preq->rq_reply.brp_code) */

      /* matching job handled - stop searching (pjob stays non-NULL) */

      break;
      }  /* END if (...) */

    pjob = nxjob;
    }  /* END while (pjob != NULL) */

  if (pjob == NULL)
    {
    LOG_EVENT(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_REQUEST,
      "obit reply",
      "Job not found for obit reply");
    }

  free_br(preq);

  shutdown(sock, 2);

  close_conn(sock);

  if (PBSNodeCheckEpilog)
    {
    check_state(1);

    mom_server_all_update_stat();
    }

  return;
  }  /* END obit_reply() */


/*
 * init_abort_jobs - on mom initialization, recover all running jobs.
 *
 * Called on initialization
 * If the -p option was given (default) (recover = JOB_RECOV_RUNNING), Mom will allow the jobs
 * to continue to run.  She depends on detecting when they terminate
 * via the slow poll method rather than SIGCHLD.
 * * If the -r option was given (recover = JOB_RECOV_TERM_REQUE), MOM is
 * recovering on a running system and the session id of the jobs should be valid;
 * the job processes are killed and the job is re-queued
 *
 * If -q was given (recover = JOB_RECOV_RQUE), it is assumed that the whole
 * system, not just MOM, is coming up, the session ids are not valid;
 * so no attempt is made to kill the job processes.  But the jobs are
 * terminated and requeued.
 *
 * If the -P option was given (recover == JOB_RECOV_DELETE), no attempt is
 * made to recover the jobs.  The jobs are deleted from the queue.
 *
 * @param recover (I) one of the JOB_RECOV_* modes described above
 */

void init_abort_jobs(

  int recover)  /* I (boolean) */

  {
  char          *id = "init_abort_jobs";

  DIR           *dir;
  int            i;
  int            j;
  int            sisters, rc;

  struct dirent *pdirent;

  job           *pj;

  char          *job_suffix = JOB_FILE_SUFFIX;
  int            job_suf_len = strlen(job_suffix);
  char          *psuffix;

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer, "%s: recover=%d",
            id,
            recover);

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_SERVER,
      msg_daemonname,
      log_buffer);
    }

  /* scan the saved-job spool directory for *.JB files to recover */

  dir = opendir(path_jobs);

  if (dir == NULL)
    {
    /* cannot recover without the job directory - fatal at startup */

    sprintf(log_buffer, "cannot open job directory '%s'",
            path_jobs);

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_SERVER,
      msg_daemonname,
      log_buffer);

    exit(1);
    }

  while ((pdirent = readdir(dir)) != NULL)
    {
    /* only consider entries whose name ends in JOB_FILE_SUFFIX */

    if ((i = strlen(pdirent->d_name)) <= job_suf_len)
      continue;

    psuffix = pdirent->d_name + i - job_suf_len;

    if (strcmp(psuffix, job_suffix))
      continue;

    /* rebuild the in-memory job structure from the saved file */

    pj = job_recov(pdirent->d_name);

    if (pj == NULL)
      {
      sprintf(log_buffer, "%s: NULL job pointer",
              id);

      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_SERVER,
        msg_daemonname,
        log_buffer);

      continue;
      }

    /* code moved to here because even when we're canceling jobs, if there is a
     * user epilogue we'll attempt to become the user, so if ji_grpcache is
     * NULL then we'll get a crash */

    if (pj->ji_grpcache == NULL)
      {
      DBPRT(("init_abort_jobs: setting grpcache for job %s\n",
             pj->ji_qs.ji_jobid));

      if (check_pwd(pj) == NULL)
        {
        /* somehow a job that was legally executing (had a password entry)
         * no longer has a password entry?? */

        snprintf(log_buffer, sizeof(log_buffer), "job %s no longer has valid password entry - deleting",
                 pj->ji_qs.ji_jobid);

        log_err(-1, id, log_buffer);

        mom_deljob(pj);

        continue;
        }
      }

    /* PW: mpiexec patch - set the globid so mom does not coredump in response to tm_spawn */

    set_globid(pj, NULL);

    append_link(&svr_alljobs, &pj->ji_alljobs, pj);

    /* NOTE(review): job_nodes() must precede task_recov() and the
     * addclient() loop below - it populates ji_numnodes/ji_hosts */

    job_nodes(pj);

    rc = task_recov(pj);

    if (LOGLEVEL >= 2)
      {
      sprintf(log_buffer, "task recovery %s for job %s, rc=%d",
              (rc == 0) ? "succeeded" : "failed",
              pj->ji_qs.ji_jobid,
              rc);

      log_record(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        id,
        log_buffer);
      }

    mom_checkpoint_recover(pj);

    /*
     * make sure we trust connections from sisters in case we get an
     * IM request before we get the real addr list from server.
     * Note: this only works after the job_nodes() call above.
     */

    for (j = 0;j < pj->ji_numnodes;j++)
      {
      if (LOGLEVEL >= 6)
        {
        sprintf(log_buffer, "%s: adding client %s",
                id,
                pj->ji_hosts[j].hn_host);

        log_record(
          PBSEVENT_ERROR,
          PBS_EVENTCLASS_SERVER,
          msg_daemonname,
          log_buffer);
        }

      addclient(pj->ji_hosts[j].hn_host);
      }  /* END for (j) */

    if (LOGLEVEL >= 4)
      {
      sprintf(log_buffer, "successfully recovered job %s",
              pj->ji_qs.ji_jobid);

      log_record(
        PBSEVENT_DEBUG,
        PBS_EVENTCLASS_JOB,
        id,
        log_buffer);
      }

    /* kill/requeue path: taken for -r and -q (but never -p or -P) when the
     * job was recovered in an active substate */

    if ((recover != JOB_RECOV_RUNNING) &&
        (recover != JOB_RECOV_DELETE) &&
        ((pj->ji_qs.ji_substate == JOB_SUBSTATE_RUNNING) ||
         (pj->ji_qs.ji_substate == JOB_SUBSTATE_PRERUN) ||
         (pj->ji_qs.ji_substate == JOB_SUBSTATE_SUSPEND) ||
         (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITED) ||
         (pj->ji_qs.ji_substate == JOB_SUBSTATE_NOTERM_REQUE) ||
         (pj->ji_qs.ji_substate == JOB_SUBSTATE_EXITING)))
      {
      if (LOGLEVEL >= 2)
        {
        sprintf(log_buffer, "job %s recovered in active state %s (full recover not enabled)",
                pj->ji_qs.ji_jobid,
                PJobSubState[pj->ji_qs.ji_substate]);

        log_record(
          PBSEVENT_DEBUG,
          PBS_EVENTCLASS_JOB,
          id,
          log_buffer);
        }

      if (recover == JOB_RECOV_TERM_REQUE) /* -r option was used to start mom */
        {
        kill_job(pj, SIGKILL, id, "recover is non-zero");
        }

      /*
      ** Check to see if I am Mother Superior.  The
      ** JOB_SVFLG_HERE flag is overloaded for MOM
      ** for this purpose.
      ** If I'm an ordinary sister, just throw the job
      ** away.  If I am MS, send a KILL_JOB request to
      ** any sisters that happen to still be alive.
      */

      if ((pj->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0)
        {
        if (LOGLEVEL >= 2)
          {
          sprintf(log_buffer, "local host is not mother-superior, deleting job %s",
                  pj->ji_qs.ji_jobid);

          log_record(
            PBSEVENT_DEBUG,
            PBS_EVENTCLASS_JOB,
            id,
            log_buffer);
          }

        mom_deljob(pj);

        continue;
        }

      if (LOGLEVEL >= 2)
        {
        sprintf(log_buffer, "setting job state to exiting for job %s in state %s",
                pj->ji_qs.ji_jobid,
                PJobSubState[pj->ji_qs.ji_substate]);

        log_record(
          PBSEVENT_DEBUG,
          PBS_EVENTCLASS_JOB,
          id,
          log_buffer);
        }

      /* set exit status to:
       * JOB_EXEC_INITABT - init abort and no checkpoint
       * JOB_EXEC_INITRST - init and checkpoint, no mig
       * JOB_EXEC_INITRMG - init and checkpoint, migrate
       * to indicate recovery abort
       */

      if (pj->ji_qs.ji_svrflags & (JOB_SVFLG_CHECKPOINT_FILE | JOB_SVFLG_CHECKPOINT_MIGRATEABLE))
        {
#if PBS_CHKPT_MIGRATE
        pj->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_INITRMG;
#else
        pj->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_INITRST;
#endif
        }
      else
        {
        pj->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_INITABT;
        }

      sisters = pj->ji_numnodes - 1;

      /*
      ** A sisterhood exists... send a KILL request.
      */

      if (sisters > 0)
        {
        DBPRT(("init_abort_jobs: Sending to sisters\n"))

        pj->ji_resources = (noderes *)calloc(sisters, sizeof(noderes));

        send_sisters(pj, IM_KILL_JOB);

        continue;
        }

      /* If mom was initialized with a -r any running processes have already
         been killed.  We set substate to JOB_SUBSTATE_NOTERM_REQUE so
         scan_for_exiting will not try to kill the running processes for
         this job */

      pj->ji_qs.ji_substate = JOB_SUBSTATE_NOTERM_REQUE;

      job_save(pj, SAVEJOB_QUICK);

      exiting_tasks = 1;
      }  /* END if ((recover != 2) && ...) */
    else if (recover == JOB_RECOV_RUNNING || recover == JOB_RECOV_DELETE)
      {
      /*
       * add: 8/11/03  David.Singleton@anu.edu.au
       *
       * Lots of job structure components need to be
       * initialized if we are leaving this job
       * running,  this is just a few.
       * Modified to accommodate JOB_RECOV_DELETE option
       * 01/13/2009 Ken Nielson knielson@adaptivecomputing.com
       */

      if (LOGLEVEL >= 2 && recover == JOB_RECOV_RUNNING)
        {
        sprintf(log_buffer, "attempting to recover job %s in state %s",
                pj->ji_qs.ji_jobid,
                PJobSubState[pj->ji_qs.ji_substate]);

        log_record(
          PBSEVENT_DEBUG,
          PBS_EVENTCLASS_JOB,
          id,
          log_buffer);
        }

      sisters = pj->ji_numnodes - 1;

      if (sisters > 0)
        pj->ji_resources = (noderes *)calloc(sisters, sizeof(noderes));

      /* re-add the job to the poll list so mom notices when it exits */

      if ((sisters > 0) && (recover == JOB_RECOV_RUNNING))
        append_link(&mom_polljobs, &pj->ji_jobque, pj);
      }
    }    /* while ((pdirent = readdir(dir)) != NULL) */

  closedir(dir);

  return;
  }  /* END init_abort_jobs() */




/*
 * mom_deljob - delete the job entry, MOM no longer knows about the job
 *
 * @param pjob (I) job to purge [modified/freed - do not use after this call]
 */

void mom_deljob(

  job *pjob)  /* I (modified) */

  {

#ifdef _CRAY
  /* remove any temporary directories */

  rmtmpdir(pjob->ji_qs.ji_jobid);

#endif /* _CRAY */

  if (LOGLEVEL >= 3)
    {
    sprintf(log_buffer, "deleting job %s in state %s",
            pjob->ji_qs.ji_jobid,
            PJobSubState[pjob->ji_qs.ji_substate]);

    log_record(
      PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  job_purge(pjob);

  return;
  }  /* END mom_deljob() */

/* END catch_child() */