/*
*         OpenPBS (Portable Batch System) v2.3 Software License
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
* other than those described below, or to purchase support for this software,
* please contact Veridian Systems, PBS Products Department ("Licensor") at:
*
*    www.OpenPBS.org  +1 650 967-4675                  sales@OpenPBS.org
*                        877 902-4PBS (US toll-free)
* ---------------------------------------------------------------------------
*
* This license covers use of the OpenPBS v2.3 software (the "Software") at
* your site or location, and, for certain users, redistribution of the
* Software to other sites and locations.  Use and redistribution of
* OpenPBS v2.3 in source and binary forms, with or without modification,
* are permitted provided that all of the following conditions are met.
* After December 31, 2001, only conditions 3-6 must be met:
*
* 1. Commercial and/or non-commercial use of the Software is permitted
*    provided a current software registration is on file at www.OpenPBS.org.
*    If use of this software contributes to a publication, product, or
*    service, proper attribution must be given; see www.OpenPBS.org/credit.html
*
* 2. Redistribution in any form is only permitted for non-commercial,
*    non-profit purposes.  There can be no charge for the Software or any
*    software incorporating the Software.  Further, there can be no
*    expectation of revenue generated as a consequence of redistributing
*    the Software.
*
* 3. Any Redistribution of source code must retain the above copyright notice
*    and the acknowledgment contained in paragraph 6, this list of conditions
*    and the disclaimer contained in paragraph 7.
*
* 4. Any Redistribution in binary form must reproduce the above copyright
*    notice and the acknowledgment contained in paragraph 6, this list of
*    conditions and the disclaimer contained in paragraph 7 in the
*    documentation and/or other materials provided with the distribution.
*
* 5. Redistributions in any form must be accompanied by information on how to
*    obtain complete source code for the OpenPBS software and any
*    modifications and/or additions to the OpenPBS software.  The source code
*    must either be included in the distribution or be available for no more
*    than the cost of distribution plus a nominal fee, and all modifications
*    and additions to the Software must be freely redistributable by any party
*    (including Licensor) without restriction.
*
* 6. All advertising materials mentioning features or use of the Software must
*    display the following acknowledgment:
*
*     "This product includes software developed by NASA Ames Research Center,
*     Lawrence Livermore National Laboratory, and Veridian Information
*     Solutions, Inc.
*     Visit www.OpenPBS.org for OpenPBS software support,
*     products, and information."
*
* 7. DISCLAIMER OF WARRANTY
*
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT
* ARE EXPRESSLY DISCLAIMED.
*
* IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE
* U.S. GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
/*
 * Functions which provide basic operation on the job structure
 *
 * Included public functions are:
 *
 *   job_abt   abort (remove from server) a job
 *   job_alloc    allocate job struct and initialize defaults
 *   job_free   free space allocated to the job structure and its
 *    children structures.
 *   job_purge   purge job from server
 *
 *   job_clone    clones a job (for use with job_arrays)
 *   job_clone_wt work task for cloning a job
 *
 * Include private function:
 *   job_init_wattr() initialize job working attribute array to "unspecified"
 *
 * NOTE: for multi-threaded TORQUE, all functions in here except find_job assume that
 * the caller holds any relevant mutexes
 */

#include <pbs_config.h>   /* the master config generated by configure */

#include <sys/param.h>
#include <sys/stat.h>
#include <ctype.h>
#include <errno.h>
#include <assert.h>

#ifndef SIGKILL
#include <signal.h>
#endif
#if __STDC__ != 1
#include <memory.h>
#endif

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <dirent.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>

#include "pbs_ifl.h"
#include "list_link.h"
#include "work_task.h"
#include "attribute.h"
#include "resource.h"
#include "server_limits.h"
#include "server.h"
#include "queue.h"
#include "batch_request.h"
#include "pbs_job.h"
#include "log.h"
#include "../lib/Liblog/pbs_log.h"
#include "../lib/Liblog/log_event.h"
#include "pbs_error.h"
#include "svrfunc.h"
#include "acct.h"
#include "net_connect.h"
#include "portability.h"
#include "array.h"
#include "pbs_job.h"
#include "resizable_array.h"
#include "dynamic_string.h"
#include "svr_func.h" /* get_svr_attr_* */


/* Fallback boolean constants, defined only if no included header supplied TRUE */
#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif

#define MAXLINE 1024
/* Verbosity threshold: functions in this file compare it against 7 or 8
 * before emitting debug-level log messages. */
extern int LOGLEVEL;


/* Forward declarations: conn_qsub() is defined later in this file and is
 * called by send_qsub_delmsg(); job_purge() is called by job_abt() and
 * job_clone_wt(). */
int conn_qsub(char *, long, char *);
void job_purge(job *);

/* External functions */

/* NOTE(review): cleanup_restart_file() is declared extern here although a
 * definition appears further down in this file. */
extern void cleanup_restart_file(job *);
/* setup_cpyfiles() is called by cpy_checkpoint() with the from/to names */
extern struct batch_request *setup_cpyfiles(struct batch_request *,job *,char*,char *,int,int);
extern int job_log_open(char *, char *);
extern int log_job_record(char *buf);
extern void check_job_log(struct work_task *ptask);

/* Local Private Functions */

static void job_init_wattr(job *);

/* Global Data items */
/* alljobs is passed to get_jobs_index() by job_clone_wt() */
struct all_jobs        alljobs;
extern struct all_jobs array_summary;

int check_job_log_started = 0;

extern struct server   server;
/* incremented once per cloned job in job_clone_wt() to fill JOB_ATR_qrank */
extern int queue_rank;
extern char *path_arrays;
/* used as a sprintf format taking a job id and a substate in job_abt() */
extern char *msg_abt_err;
/* directory prefix for job files built in job_clone() and job_clone_wt() */
extern char *path_jobs;
extern char *path_spool;
extern char *path_aux;
/* prepended (followed by ':') to the server-side file name in cpy_checkpoint() */
extern char  server_name[];
/* NOTE(review): LOGLEVEL is already declared above; this second declaration is redundant */
extern int   LOGLEVEL;

/* directory prefix for the server-side checkpoint directory in cpy_checkpoint() */
extern char *path_checkpoint;
extern char *path_jobinfo_log;
extern char *log_file;
extern char *job_log_file;


/*
 * send_qsub_delmsg - send a short text message to the qsub that submitted
 * an interactive job.
 *
 * The host is taken from the PBS_O_HOST entry of the job's variable list and
 * the port from the job's interactive attribute.  The message is written as
 * "PBS: " followed by text.  Delivery is best effort: every failure is
 * silent, and the socket is always closed once it has been opened.
 */

void send_qsub_delmsg(

  job  *pjob,  /* I */
  char *text)  /* I */

  {
  char      *phost;
  attribute *pattri;
  int        qsub_sock;
  ssize_t    wrote;

  phost = arst_string("PBS_O_HOST", &pjob->ji_wattr[JOB_ATR_variables]);

  /* the entry has the form NAME=value; the host is everything after '=' */

  if ((phost == NULL) || ((phost = strchr(phost, '=')) == NULL))
    {
    return;
    }

  pattri = &pjob->ji_wattr[JOB_ATR_interactive];

  qsub_sock = conn_qsub(phost + 1, pattri->at_val.at_long, NULL);

  if (qsub_sock < 0)
    {
    return;
    }

  /* only attempt the body if the prefix went out; either way fall through
   * to close() so the descriptor is not leaked on a failed write */

  if (write(qsub_sock, "PBS: ", 5) != -1)
    {
    wrote = write(qsub_sock, text, strlen(text));

    (void)wrote;
    }

  close(qsub_sock);

  return;
  }  /* END send_qsub_delmsg() */



/*
 * remtree - remove a tree (or single file)
 *
 * returns  0 on success
 *  -1 on failure
 */

int remtree(

  char *dirname)  /* I - path of the directory tree (or single file) to remove */

  {
  /*
   * Removes dirname.  When dirname is a directory, every entry other than
   * "." and ".." is removed first (recursing into sub-directories) and the
   * directory itself is removed last.  Symbolic links are not followed
   * because lstat() is used.
   *
   * Returns 0 on success, -1 if any step failed.  A failure in one entry
   * does not stop the removal of the remaining entries, and is never
   * masked by a later success.
   */

  static char    id[] = "remtree";
  DIR           *dir;

  struct dirent *pdir;
  char           namebuf[MAXPATHLEN];
  char           log_buf[LOCAL_LOG_BUF_SIZE];
  size_t         prefix_len;
  int            saved_errno;
  int            rtnv = 0;

#if defined(HAVE_STRUCT_STAT64) && defined(HAVE_STAT64) && defined(LARGEFILE_WORKS)
  struct stat64  sb;
#else
  struct stat    sb;
#endif

#if defined(HAVE_STRUCT_STAT64) && defined(HAVE_STAT64) && defined(LARGEFILE_WORKS)

  if (lstat64(dirname, &sb) == -1)
#else
  if (lstat(dirname, &sb) == -1)
#endif
    {

    if (errno != ENOENT)
      log_err(errno, id, "stat");

    return(-1);
    }

  if (S_ISDIR(sb.st_mode))
    {
    if ((dir = opendir(dirname)) == NULL)
      {
      if (errno != ENOENT)
        log_err(errno, id, "opendir");

      return(-1);
      }

    /* namebuf holds "<dirname>/" followed by the current entry name;
     * make sure the prefix (plus terminator) fits before building it */

    prefix_len = strlen(dirname) + 1;

    if (prefix_len >= sizeof(namebuf))
      {
      snprintf(log_buf, sizeof(log_buf), "path too long: %s", dirname);

      log_err(ENAMETOOLONG, id, log_buf);

      closedir(dir);

      return(-1);
      }

    snprintf(namebuf, sizeof(namebuf), "%s/", dirname);

    while ((pdir = readdir(dir)) != NULL)
      {
      /* skip only the self and parent entries; other names that merely
       * begin with dots must still be removed or rmdir() below will fail */

      if ((strcmp(pdir->d_name, ".") == 0) ||
          (strcmp(pdir->d_name, "..") == 0))
        continue;

      if (prefix_len + strlen(pdir->d_name) >= sizeof(namebuf))
        {
        snprintf(log_buf, sizeof(log_buf), "path too long: %s/%s",
          dirname,
          pdir->d_name);

        log_err(ENAMETOOLONG, id, log_buf);

        rtnv = -1;

        continue;
        }

      snprintf(namebuf + prefix_len, sizeof(namebuf) - prefix_len, "%s",
        pdir->d_name);

#if defined(HAVE_STRUCT_STAT64) && defined(HAVE_STAT64) && defined(LARGEFILE_WORKS)
      if (lstat64(namebuf, &sb) == -1)
#else
      if (lstat(namebuf, &sb) == -1)
#endif
        {
        log_err(errno, id, "stat");

        rtnv = -1;

        continue;
        }

      if (S_ISDIR(sb.st_mode))
        {
        /* keep an earlier failure even if this sub-tree is removed cleanly */

        if (remtree(namebuf) != 0)
          rtnv = -1;
        }
      else if (unlink(namebuf) < 0)
        {
        if (errno != ENOENT)
          {
          /* capture errno before formatting, which may change it */
          saved_errno = errno;

          snprintf(log_buf, sizeof(log_buf), "unlink failed on %s", namebuf);

          log_err(saved_errno, id, log_buf);

          rtnv = -1;
          }
        }
      else if (LOGLEVEL >= 7)
        {
        snprintf(log_buf, sizeof(log_buf), "unlink(1) succeeded on %s", namebuf);

        log_ext(-1, id, log_buf, LOG_DEBUG);
        }
      }    /* END while ((pdir = readdir(dir)) != NULL) */

    closedir(dir);

    if (rmdir(dirname) < 0)
      {
      if ((errno != ENOENT) && (errno != EINVAL))
        {
        saved_errno = errno;

        snprintf(log_buf, sizeof(log_buf), "rmdir failed on %s", dirname);

        log_err(saved_errno, id, log_buf);

        rtnv = -1;
        }
      }
    else if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf), "rmdir succeeded on %s", dirname);

      log_ext(-1, id, log_buf, LOG_DEBUG);
      }
    }
  else if (unlink(dirname) < 0)
    {
    saved_errno = errno;

    snprintf(log_buf, sizeof(log_buf), "unlink failed on %s", dirname);

    log_err(saved_errno, id, log_buf);

    rtnv = -1;
    }
  else if (LOGLEVEL >= 7)
    {
    snprintf(log_buf, sizeof(log_buf), "unlink(2) succeeded on %s", dirname);

    log_ext(-1, id, log_buf, LOG_DEBUG);
    }

  return(rtnv);
  }  /* END remtree() */




/*
 * job_abt - abort a job
 *
 * The job removed from the system and a mail message is sent
 * to the job owner.
 */

/* NOTE:  this routine is called under the following conditions:
 * 1) by req_deletejob whenever deleting a job that is not running,
 *    not transiting, not exiting and does not have a checkpoint
 *    file on the mom.
 * 2) by req_deletearray whenever deleting a job that is not running,
 *    not transiting, not in prerun, not exiting and does not have a
 *    checkpoint file on the mom.
 * 3) by close_quejob when the server fails to enqueue the job.
 * 4) by array_delete_wt for prerun jobs that hang around too long and
 *    do not have a checkpoint file on the mom.
 * 5) by pbsd_init when recovering jobs.
 * 6) by svr_movejob when done routing jobs around.
 * 7) by queue_route when trying to route any "ready" jobs in a specific queue.
 * 8) by req_shutdown when trying to shutdown.
 * 9) by req_register when the request operation is JOB_DEPEND_OP_DELETE.
 */

int job_abt(

  job  **pjobp, /* I (modified/freed) - set to NULL on the paths that purge the job */
  char  *text)  /* I (optional) - abort message; NULL suppresses accounting, mail and qsub notice */

  {
  /*
   * Three cases, selected by the job's state on entry:
   *  - RUNNING: the substate is set to ABORT and SIGKILL is issued.  Only if
   *    issuing the signal fails is the job cleaned up and purged here.
   *  - TRANSIT with substate TRNOUT: only an error is logged.
   *  - anything else: the job is moved to EXITING/ABORT, cleaned up and purged.
   *
   * Returns the result of issue_signal() for running jobs, 0 otherwise.
   */

  char *myid = "job_abt";
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  int   old_state;
  int   old_substate;
  int   rc = 0;

  job  *pjob = *pjobp;

  /* save old state and update state to Exiting */

  old_state = pjob->ji_qs.ji_state;
  old_substate = pjob->ji_qs.ji_substate;

  /* notify user of abort if notification was requested */

  if (text != NULL)
    {
    /* req_delete sends own mail and acct record */

    account_record(PBS_ACCT_ABT, pjob, "");
    svr_mailowner(pjob, MAIL_ABORT, MAIL_NORMAL, text);

    if ((pjob->ji_qs.ji_state == JOB_STATE_QUEUED) &&
        ((pjob->ji_wattr[JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
         pjob->ji_wattr[JOB_ATR_interactive].at_val.at_long))
      {
      /* interactive and not yet running... send a note to qsub */

      send_qsub_delmsg(pjob, text);
      }
    }

  if (old_state == JOB_STATE_RUNNING)
    {
    svr_setjobstate(pjob, JOB_STATE_RUNNING, JOB_SUBSTATE_ABORT, FALSE);

    /* issue_signal() receives &pjob and the code below re-checks pjob for
     * NULL, so the pointer may have been cleared by the call */
    if ((rc = issue_signal(&pjob, "SIGKILL", release_req, 0)) != 0)
      {
      if (pjob != NULL)
        {
        sprintf(log_buf, msg_abt_err, pjob->ji_qs.ji_jobid, old_substate);
        
        log_err(-1, myid, log_buf);
        
        if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0)
          {
          /* notify creator that job is exited */
          
          pjob->ji_wattr[JOB_ATR_state].at_val.at_char = 'E';
          
          issue_track(pjob);
          }
        
        if (pjob->ji_wattr[JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
          {
          depend_on_term(pjob);
          }
        
        /* update internal array bookkeeping values */
        if ((pjob->ji_arraystruct != NULL) &&
            (pjob->ji_is_array_template == FALSE))
          {
          /* NOTE(review): pa is used without a NULL check, while
           * job_clone_wt() treats a NULL return from get_jobs_array() as
           * possible - confirm it cannot be NULL when pjob is non-NULL */
          job_array *pa = get_jobs_array(&pjob);
          
          if (pjob != NULL)
            {
            update_array_values(pa,pjob,old_state,aeTerminate);
            
            /* NOTE(review): "unlocked" is logged before the unlock here,
             * whereas the non-running branch below logs after it */
            if (LOGLEVEL >= 7)
              {
              sprintf(log_buf, "unlocked ai_mutex: %s", myid);
              log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
              }
            pthread_mutex_unlock(pa->ai_mutex);
            }
          }
      
        if (pjob != NULL)
          job_purge(pjob);

        /* the caller's pointer is cleared only on this failure path; when
         * the signal is issued successfully *pjobp is left untouched */
        *pjobp = NULL;
        }
      }
    }
  else if ((old_state == JOB_STATE_TRANSIT) &&
           (old_substate == JOB_SUBSTATE_TRNOUT))
    {
    /* I don't know of a case where this could happen */

    sprintf(log_buf, msg_abt_err,
      pjob->ji_qs.ji_jobid,
      old_substate);

    log_err(-1, myid, log_buf);
    }
  else
    {
    svr_setjobstate(pjob, JOB_STATE_EXITING, JOB_SUBSTATE_ABORT, FALSE);

    if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) == 0)
      {
      /* notify creator that job is exited */

      issue_track(pjob);
      }

    if (pjob->ji_wattr[JOB_ATR_depend].at_flags & ATR_VFLAG_SET)
      {
      depend_on_term(pjob);
      }

    /* update internal array bookkeeping values */
    if ((pjob->ji_arraystruct != NULL) &&
        (pjob->ji_is_array_template == FALSE))
      {
      /* NOTE(review): same unchecked use of pa as in the running branch */
      job_array *pa = get_jobs_array(&pjob);

      if (pjob != NULL)
        {
        update_array_values(pa,pjob,old_state,aeTerminate);
        
        pthread_mutex_unlock(pa->ai_mutex);
        if (LOGLEVEL >= 7)
          {
          sprintf(log_buf, "unlocked ai_mutex: %s", myid);
          log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
          }
        }
      }

    if (pjob != NULL)
      job_purge(pjob);

    /* the job is gone (or was already cleared); clear the caller's pointer */
    *pjobp = NULL;
    }

  return(rc);
  }  /* END job_abt() */


/*
 * conn_qsub - connect to the qsub that submitted this interactive job
 * return >= 0 on SUCCESS, < 0 on FAILURE
 * (this was moved from resmom/mom_inter.c)
 */



int conn_qsub(

  char *hostname,  /* I */
  long  port,      /* I */
  char *EMsg)      /* O (optional,minsize=1024) */

  {
  /*
   * Resolves hostname, connects to it on port via client_to_svr() and
   * clears O_NONBLOCK on the resulting socket.
   *
   * Returns the connected socket (>= 0) on success, -1 on failure.  When
   * EMsg is non-NULL it is emptied on entry and receives a message if the
   * host address lookup fails; it is also handed to client_to_svr().
   */

  pbs_net_t hostaddr;
  int s;

  int flags;
  int local_errno = 0;

  if (EMsg != NULL)
    EMsg[0] = '\0';

  if ((hostaddr = get_hostaddr(&local_errno, hostname)) == (pbs_net_t)0)
    {
#if !defined(H_ERRNO_DECLARED) && !defined(_AIX)
    extern int h_errno;
#endif

    /* FAILURE */

    if (EMsg != NULL)
      {
      snprintf(EMsg, 1024, "cannot get address for host '%s', h_errno=%d",
               hostname,
               h_errno);
      }

    return(-1);
    }

  s = client_to_svr(hostaddr, (unsigned int)port, 0, EMsg);

  /* NOTE:  client_to_svr() can return 0 for SUCCESS */

  if (s < 0)
    {
    /* FAILURE */

    return(-1);
    }

  /* SUCCESS */

  /* this socket should be blocking */

  flags = fcntl(s, F_GETFL);

  /* F_GETFL reports failure as -1; masking that value and handing it to
   * F_SETFL would turn on nearly every status flag, so only update the
   * flags when they were actually read */

  if (flags != -1)
    {
    flags &= ~O_NONBLOCK;

    fcntl(s, F_SETFL, flags);
    }

  return(s);
  }  /* END conn_qsub() */




/*
 * job_alloc - allocate space for a job structure and initialize working
 * attribute to "unset"
 *
 * Returns: pointer to structure or null if space is not available.
 *
 * @see job_init_wattr() - child
 */

job *job_alloc(void)

  {
  /*
   * Allocates a zero-filled job structure together with its mutex and sets
   * the non-zero defaults.  The job is returned with ji_mutex LOCKED; the
   * caller is responsible for unlocking it.
   *
   * Returns NULL (after logging) if either allocation fails.
   */

  job *pj = (job *)calloc(1, sizeof(job));
  
  if (pj == NULL)
    {
    log_err(errno, "job_alloc", "no memory");
    
    return(NULL);
    }

  pj->ji_mutex = (pthread_mutex_t *)calloc(1, sizeof(pthread_mutex_t));

  if (pj->ji_mutex == NULL)
    {
    /* without a mutex the job is unusable; do not hand NULL to pthread_* */
    log_err(errno, "job_alloc", "no memory");

    free(pj);

    return(NULL);
    }

  pthread_mutex_init(pj->ji_mutex,NULL);
  pthread_mutex_lock(pj->ji_mutex);

  pj->ji_qs.qs_version = PBS_QS_VERSION;

  CLEAR_HEAD(pj->ji_rejectdest);
  pj->ji_arraystruct = NULL;
  pj->ji_is_array_template = FALSE;

  pj->ji_momhandle = -1;  /* mark mom connection invalid */

  /* set the working attributes to "unspecified" */
  job_init_wattr(pj);
  
  return(pj);
  }  /* END job_alloc() */




/*
 * job_free - free job structure and its various sub-structures
 */

void job_free(

  job *pj)  /* I (modified) */

  {
  /*
   * Releases the storage held by the job's working attributes and its list
   * of rejected destinations, hands the job to the recycler and finally
   * unlocks the job's mutex.
   */

  int       attr_idx = 0;
  badplace *bad_dest;
  char      log_buf[LOCAL_LOG_BUF_SIZE];

  if (LOGLEVEL >= 8)
    {
    sprintf(log_buf, "freeing job");

    log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pj->ji_qs.ji_jobid, log_buf);
    }

  /* let every attribute definition release whatever its value owns */
  while (attr_idx < JOB_ATR_LAST)
    {
    job_attr_def[attr_idx].at_free(&pj->ji_wattr[attr_idx]);

    attr_idx++;
    }

  /* drain the bad destination list, always taking the current first entry */
  for (bad_dest = (badplace *)GET_NEXT(pj->ji_rejectdest);
       bad_dest != NULL;
       bad_dest = (badplace *)GET_NEXT(pj->ji_rejectdest))
    {
    delete_link(&bad_dest->bp_link);

    free(bad_dest);
    }

  /* The job is not freed here.  Deleting right away can cause a race where
   * two threads are pending on the same job: thread 1 gets the lock and
   * deletes the job, thread 2 then gets the lock of a job that is being
   * freed, causing segfaults.  The recycler and the ji_being_recycled flag
   * are used to avoid that. */
  insert_into_recycler(pj);

  pthread_mutex_unlock(pj->ji_mutex);

  return;
  }  /* END job_free() */





/*
 * job_clone - create a clone of a job for use with job arrays
 */

job *job_clone(

  job       *template_job, /* I */  /* job to clone */
  job_array *pa,           /* I */  /* array which the job is a part of */
  int        taskid)  /* I */

  {
  /*
   * Builds a new job from template_job for array index taskid:
   *  - the job id becomes "<seq>[<taskid>].<host>", derived from the
   *    template id, which must contain both '[' and '.'
   *  - an empty job file is created under path_jobs to reserve a unique
   *    file prefix
   *  - every set attribute is copied; errpath, outpath and jobname get
   *    "-<taskid>" appended
   *  - an array hold (HOLD_a) is placed on the clone, the array id
   *    attribute is set and PBS_ARRAYID=<taskid> is added to the variables
   *  - the clone's id is stored in pa->job_ids[taskid]
   *
   * If pa is NULL the array is looked up with get_array() and its ai_mutex
   * is unlocked again before returning.
   *
   * Returns the new job, or NULL on failure.  The job comes from
   * job_alloc(), which returns it with ji_mutex locked.
   */

  char log_buf[LOCAL_LOG_BUF_SIZE];

  job  *pnewjob;
  attribute tempattr;

  char  *oldid;
  char  *hostname;
  char  *bracket;
  char  *tmpstr;
  char  *new_id;
  char   basename[PBS_JOBBASE+1];
  char   namebuf[MAXPATHLEN + 1];
  char   buf[256];
  char  *pc;
  int    fds;

  int    i;
  size_t tmplen;
  int    release_mutex = FALSE;


  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "taskid %d", taskid);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
    }

  /* taskid is used below as an index into pa->job_ids, so a negative value
   * must be rejected as well as one that is too large */
  if ((taskid < 0) || (taskid > PBS_MAXJOBARRAY))
    {
    log_err(-1, __func__, "taskid out of range");

    return(NULL);
    }

  /* job_alloc() already clears ji_rejectdest and initializes the working
   * attributes, so neither is repeated here */
  if ((pnewjob = job_alloc()) == NULL)
    {
    log_err(errno, __func__, "no memory");

    return(NULL);
    }

  /* new job structure is allocated,
     now we need to copy the old job, but modify based on taskid */
  pnewjob->ji_modified = 1;   /* struct changed, needs to be saved */

  /* copy the fixed size quick save information */
  memcpy(&pnewjob->ji_qs, &template_job->ji_qs, sizeof(struct jobfix));

  /* find the job id for the cloned job */
  if ((oldid = strdup(template_job->ji_qs.ji_jobid)) == NULL)
    {
    log_err(ENOMEM, __func__, "no memory");
    job_free(pnewjob);

    return(NULL);
    }

  bracket = strchr(oldid, '[');
  hostname = strchr(oldid, '.');

  if ((bracket == NULL) || (hostname == NULL))
    {
    /* not of the form <seq>[...].<host>; nothing sensible can be built */
    log_err(-1, __func__, "template job id is not an array job id");
    free(oldid);
    job_free(pnewjob);

    return(NULL);
    }

  /* cut oldid down to the sequence number; hostname still points into the
   * same buffer, just past the '.' */
  *bracket = '\0';
  hostname++;

  pnewjob->ji_qs.ji_jobid[PBS_MAXSVRJOBID-1] = '\0';

  snprintf(pnewjob->ji_qs.ji_jobid, PBS_MAXSVRJOBID, "%s[%d].%s",
    oldid, taskid, hostname);


  /* update the job filename
   * We could optimize the sub-jobs to all use the same file. We would need a
   * way to track the number of tasks still using the job file so we know when
   * to delete it.
   */

  /*
   * make up new job file name, it is based on the new jobid
   */

  snprintf(basename, PBS_JOBBASE, "%s-%d.%s", oldid, taskid, hostname);
  free(oldid);

  /* create the job file exclusively; if the name is taken, bump the last
   * printable, non-'-' character of basename and try again */
  do
    {
    snprintf(namebuf, sizeof(namebuf), "%s%s%s",
      path_jobs, basename, JOB_FILE_SUFFIX);

    fds = open(namebuf, O_CREAT | O_EXCL | O_WRONLY, 0600);

    if (fds < 0)
      {
      if (errno == EEXIST)
        {
        pc = basename + strlen(basename) - 1;

        /* isprint() requires a value representable as unsigned char */
        while (!isprint((unsigned char)*pc) || (*pc == '-'))
          {
          pc--;

          if (pc <= basename)
            {
            /* FAILURE */

            log_err(errno, __func__, "job file is corrupt");
            job_free(pnewjob);

            return(NULL);
            }
          }

        (*pc)++;
        }
      else
        {
        /* FAILURE */

        log_err(errno, __func__, "cannot create job file");
        job_free(pnewjob);

        return(NULL);
        }
      }
    }
  while (fds < 0);

  close(fds);

  strcpy(pnewjob->ji_qs.ji_fileprefix, basename);

  /* copy job attributes. some of these are going to have to be modified */

  for (i = 0; i < JOB_ATR_LAST; i++)
    {
    if (template_job->ji_wattr[i].at_flags & ATR_VFLAG_SET)
      {
      if ((i == JOB_ATR_errpath) || (i == JOB_ATR_outpath) || (i == JOB_ATR_jobname))
        {
        /* append "-<taskid>" to errpath, outpath and jobname */

        /* room for the original string, '-', the task id digits and the
         * terminator (presumably PBS_MAXJOBARRAYLEN bounds the digit
         * count - TODO confirm); snprintf keeps the write bounded anyway */
        tmplen = strlen(template_job->ji_wattr[i].at_val.at_str) +
                 PBS_MAXJOBARRAYLEN + 2;

        tmpstr = (char *)calloc(tmplen, sizeof(char));

        if (tmpstr == NULL)
          {
          log_err(ENOMEM, __func__, "no memory");
          job_free(pnewjob);

          return(NULL);
          }

        snprintf(tmpstr, tmplen, "%s-%d",
                 template_job->ji_wattr[i].at_val.at_str,
                 taskid);

        clear_attr(&tempattr, &job_attr_def[i]);

        job_attr_def[i].at_decode(
          &tempattr,
          NULL,
          NULL,
          tmpstr,
          ATR_DFLAG_ACCESS);

        job_attr_def[i].at_set(
          &pnewjob->ji_wattr[i],
          &tempattr,
          SET);

        job_attr_def[i].at_free(&tempattr);

        free(tmpstr);
        }
      else
        {
        job_attr_def[i].at_set(
          &(pnewjob->ji_wattr[i]),
          &(template_job->ji_wattr[i]),
          SET);
        }
      }
    }

  /* put a system hold on the job.  we'll take the hold off once the
   * entire array is cloned. We don't want any of the jobs to run and
   * complete before the whole thing is cloned. This is in case we run into
   * a problem during setting up the array and want to abort before any of
   * the jobs run */
  pnewjob->ji_wattr[JOB_ATR_hold].at_val.at_long |= HOLD_a;
  pnewjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;

  /* set JOB_ATR_job_array_id */
  pnewjob->ji_wattr[JOB_ATR_job_array_id].at_val.at_long = taskid;
  pnewjob->ji_wattr[JOB_ATR_job_array_id].at_flags |= ATR_VFLAG_SET;

  /* set PBS_ARRAYID environment variable */
  clear_attr(&tempattr, &job_attr_def[JOB_ATR_variables]);

  snprintf(buf, sizeof(buf), "PBS_ARRAYID=%d", taskid);

  job_attr_def[JOB_ATR_variables].at_decode(&tempattr,
      NULL,
      NULL,
      buf,
      0);

  job_attr_def[JOB_ATR_variables].at_set(
    &pnewjob->ji_wattr[JOB_ATR_variables],
    &tempattr,
    INCR);

  job_attr_def[JOB_ATR_variables].at_free(&tempattr);

  /* we need to put the cloned job into the array */
  if (pa == NULL)
    {
    release_mutex = TRUE;
    
    pa = get_array(template_job->ji_qs.ji_jobid);
    if (pa == NULL)
      {
      job_free(pnewjob);
      return(NULL);
      }
    }

  new_id = strdup(pnewjob->ji_qs.ji_jobid);

  if (new_id == NULL)
    {
    /* job_clone_wt() treats a NULL slot as "no job", so a clone whose id
     * could not be recorded would never have its array hold released */
    log_err(ENOMEM, __func__, "no memory");

    if (release_mutex == TRUE)
      pthread_mutex_unlock(pa->ai_mutex);

    job_free(pnewjob);

    return(NULL);
    }

  pa->job_ids[taskid] = new_id;
  pnewjob->ji_arraystruct = pa;

  if (release_mutex == TRUE)
    {
    pthread_mutex_unlock(pa->ai_mutex);
    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "unlocked ai_mutex: %s", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pnewjob->ji_qs.ji_jobid, log_buf);
      }
    }

  return(pnewjob);
  } /* END job_clone() */



/* Default for CLONE_BATCH_SIZE, overridable at build time.
 * NOTE(review): not referenced by the functions visible in this part of the
 * file - confirm it is still used elsewhere. */
#ifndef CLONE_BATCH_SIZE
#define CLONE_BATCH_SIZE 256
#endif /* CLONE_BATCH_SIZE */



/*
 * job_clone_wt - worktask to clone jobs for job array
 */

void job_clone_wt(

  struct work_task *ptask)  /* I (freed) - wt_parm1 carries the template job id string (freed here) */

  {
  /*
   * Work task that creates the sub-jobs of a job array.  For every request
   * token (index range) attached to the array it clones the template job
   * once per index, evaluates and sets the clone's state, assigns a queue
   * rank, enqueues and saves it.  Afterwards the array is saved and the
   * array hold (HOLD_a) placed by job_clone() is removed from every job
   * listed in pa->job_ids.
   *
   * The array's ai_mutex is unlocked at the end, so get_jobs_array() is
   * presumably returning the array locked - TODO confirm.
   */

  char                log_buf[LOCAL_LOG_BUF_SIZE];
  job                *template_job;
  job                *pjob;
  job                *pjobclone;
  char               *jobid;

  int                 i;
  int                 prev_index = -1;
  int                 actual_job_count = 0;
  int                 newstate;
  int                 newsub;
  int                 rc;
  char                namebuf[MAXPATHLEN];
  job_array          *pa;

  array_request_node *rn;
  int                 start;
  int                 end;

  /* take ownership of the job id and release the task itself */
  jobid = (char *)(ptask->wt_parm1);
  free(ptask->wt_mutex);
  free(ptask);

  if (jobid == NULL)
    {
    log_err(ENOMEM, __func__, "Can't malloc");
    return;
    }

  /* look up the template job, then the array it belongs to.  The job
   * mutex is unlocked below, so find_job() presumably returns the job
   * locked - TODO confirm.
   * NOTE(review): an earlier comment here said not to call get_jobs_array
   * for the template job, yet the code does call it. */
  if (((template_job = find_job(jobid)) == NULL) ||
      ((pa = get_jobs_array(&template_job)) == NULL))
    {
    free(jobid);
    if (template_job != NULL)
      pthread_mutex_unlock(template_job->ji_mutex);
    return;
    }

  free(jobid);

  /* NOTE(review): namebuf is filled in here but not read again anywhere in
   * this function */
  snprintf(namebuf, sizeof(namebuf), "%s%s.AR",
    path_jobs, template_job->ji_qs.ji_fileprefix);
  pthread_mutex_unlock(template_job->ji_mutex);

  /* always work on the first remaining request token; a token is removed
   * from the list only once rn->start has passed rn->end */
  while ((rn = (array_request_node *)GET_NEXT(pa->request_tokens)) != NULL)
    {
    start = rn->start;
    end = rn->end;

    for (i = start; i <= end; i++)
      {
      /* NOTE(review): template_job was unlocked above and is re-locked
       * through the saved pointer without being looked up again */
      pthread_mutex_lock(template_job->ji_mutex);
      pjobclone = job_clone(template_job, pa, i);
      pthread_mutex_unlock(template_job->ji_mutex);

      if (pjobclone == NULL)
        {
        log_err(-1, __func__, "unable to clone job in job_clone_wt");
        continue;
        }

      svr_evaljobstate(pjobclone, &newstate, &newsub, 1);
      svr_setjobstate(pjobclone, newstate, newsub, FALSE);

      pjobclone->ji_wattr[JOB_ATR_qrank].at_val.at_long = ++queue_rank;
      pjobclone->ji_wattr[JOB_ATR_qrank].at_flags |= ATR_VFLAG_SET;

      if ((rc = svr_enquejob(pjobclone, FALSE, prev_index)))
        {
        /* XXX need more robust error handling */
        /* the array mutex is released around job_purge() and re-taken */
        pthread_mutex_unlock(pa->ai_mutex);
        job_purge(pjobclone);
        pthread_mutex_lock(pa->ai_mutex);
        continue;
        }

      if (job_save(pjobclone, SAVEJOB_FULL, 0) != 0)
        {
        /* XXX need more robust error handling */
        pthread_mutex_unlock(pa->ai_mutex);
        job_purge(pjobclone);
        pthread_mutex_lock(pa->ai_mutex);
        continue;
        }
      
      /* remember where this clone landed; passed to the next svr_enquejob() */
      prev_index = get_jobs_index(&alljobs, pjobclone);
      
      pa->ai_qs.num_cloned++;
      
      /* NOTE(review): rn->start advances only for successful clones, so
       * after any failure above it no longer equals the next index and the
       * token stays on the list; the while loop then walks the tail of the
       * range again, and never ends if a clone keeps failing */
      rn->start++;
      
      pthread_mutex_unlock(pjobclone->ji_mutex);
      }  /* END for (i) */

    if (rn->start > rn->end)
      {
      delete_link(&rn->request_tokens_link);
      free(rn);
      }
    }    /* END while (loop) */
      
  array_save(pa);

  /* scan over all the jobs in the array and unset the hold */
  for (i = 0; i < pa->ai_qs.array_size; i++)
    {
    if (pa->job_ids[i] == NULL)
      continue;
    
    actual_job_count++;
    
    if ((pjob = find_job(pa->job_ids[i])) == NULL)
      {
      /* the job no longer exists; drop its id from the array */
      free(pa->job_ids[i]);
      pa->job_ids[i] = NULL;
      }
    else
      {
      long moab_compatible = FALSE;;
      get_svr_attr_l(SRV_ATR_MoabArrayCompatible, &moab_compatible);
      pjob->ji_wattr[JOB_ATR_hold].at_val.at_long &= ~HOLD_a;
      
      if (moab_compatible != FALSE)
        {
        /* if configured and necessary, apply a slot limit hold to all
         * jobs above the slot limit threshold */
        if ((pa->ai_qs.slot_limit != NO_SLOT_LIMIT) &&
            (actual_job_count > pa->ai_qs.slot_limit))
          {
          pjob->ji_wattr[JOB_ATR_hold].at_val.at_long |= HOLD_l;
          }
        }
      
      /* the hold attribute counts as set only while some hold bit remains */
      if (pjob->ji_wattr[JOB_ATR_hold].at_val.at_long == 0)
        {
        pjob->ji_wattr[JOB_ATR_hold].at_flags &= ~ATR_VFLAG_SET;
        }
      else
        {
        pjob->ji_wattr[JOB_ATR_hold].at_flags |= ATR_VFLAG_SET;
        }
      
      pjob->ji_modified = TRUE;
      svr_evaljobstate(pjob, &newstate, &newsub, 1);
      svr_setjobstate(pjob, newstate, newsub, FALSE);
      
      pthread_mutex_unlock(pjob->ji_mutex);
      }
    }

  if (LOGLEVEL >= 7)
    {
    sprintf(log_buf, "unlocked ai_mutex: %s", __func__);
    log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
    }
  
  pthread_mutex_unlock(pa->ai_mutex);
  }  /* END job_clone_wt */




/*
 * job_init_wattr - initialize job working attribute array
 * set the types and the "unspecified value" flag
 */

static void job_init_wattr(

  job *pj)  /* I (modified) */

  {
  /* Reset each of the JOB_ATR_LAST working attributes of pj through
   * clear_attr(), pairing it with its entry in job_attr_def. */

  int attr_idx = 0;

  while (attr_idx < JOB_ATR_LAST)
    {
    clear_attr(&pj->ji_wattr[attr_idx], &job_attr_def[attr_idx]);

    attr_idx++;
    }
  }   /* END job_init_wattr() */



/*
 * cpy_checkpoint - set up a Copy Files request to transfer checkpoint files
 */

struct batch_request *cpy_checkpoint(

  struct batch_request *preq,
  job                  *pjob,
  enum job_atr          ati,  /* JOB_ATR_checkpoint_name or JOB_ATR_restart_name */
  int                   direction)

  {
  /*
   * Builds the server-side and mom-side names of the job's checkpoint file
   * and passes them to setup_cpyfiles().  The server-side checkpoint
   * directory for the job is created if it does not exist yet.
   *
   * Returns the request produced by setup_cpyfiles().  preq is returned
   * unchanged when the attribute ati is not set, when the server-side path
   * does not fit in MAXPATHLEN, or when memory cannot be allocated.
   *
   * direction is only inspected here to choose the log wording
   * (CKPT_DIR_OUT: mom to server, otherwise server to mom); it is passed
   * through to setup_cpyfiles().
   */

  char        momfile[MAXPATHLEN+1];
  char        serverfile[MAXPATHLEN+1];
  char       *from = NULL;
  char       *to = NULL;
  char        log_buf[LOCAL_LOG_BUF_SIZE];
  attribute  *pattr;
  mode_t      saveumask = 0;
  size_t      dir_len;
  int         written;
  
  pattr = &pjob->ji_wattr[ati];

  if ((pattr->at_flags & ATR_VFLAG_SET) == 0)
    {
    /* no file to transfer */
    
    return(preq);
    }
    
  /* build up the name used for SERVER file */
  written = snprintf(serverfile, sizeof(serverfile), "%s%s%s",
    path_checkpoint, pjob->ji_qs.ji_fileprefix, JOB_CHECKPOINT_SUFFIX);

  if ((written < 0) || ((size_t)written >= sizeof(serverfile)))
    {
    /* a truncated directory name must not be created or used */
    log_err(-1, "cpy_checkpoint", "checkpoint directory path is too long");

    return(preq);
    }
  
  /*
   * We need to make sure the jobs checkpoint directory exists.  If it does
   * not we need to add it since this is the first time we are copying a
   * checkpoint file for this job
   */

  /* clear the umask so the directory really gets mode 01777 */
  saveumask = umask(0000);
  if ((mkdir(serverfile, 01777) == -1) && (errno != EEXIST))
    {
    log_err(errno,"cpy_checkpoint", "Failed to create jobs checkpoint directory");
    }
  umask(saveumask);

  /* append "/<checkpoint name>" without running past the end of the buffer.
   * NOTE(review): the checkpoint_name attribute is used here even when ati
   * is JOB_ATR_restart_name - confirm that is intended */
  dir_len = strlen(serverfile);

  written = snprintf(serverfile + dir_len, sizeof(serverfile) - dir_len, "/%s",
    pjob->ji_wattr[JOB_ATR_checkpoint_name].at_val.at_str);

  if ((written < 0) || ((size_t)written >= sizeof(serverfile) - dir_len))
    {
    log_err(-1, "cpy_checkpoint", "checkpoint file path is too long");

    return(preq);
    }

  /* build up the name used for MOM file */

  if (pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_flags & ATR_VFLAG_SET)
    {
    snprintf(momfile, sizeof(momfile), "%s/%s%s/%s",
      pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str,
      pjob->ji_qs.ji_fileprefix,
      JOB_CHECKPOINT_SUFFIX,
      pattr->at_val.at_str);
    }
  else
    {
    /* if not specified, moms path may not be the same */
    snprintf(momfile, sizeof(momfile), "%s/%s%s/%s",
      MOM_DEFAULT_CHECKPOINT_DIR,
      pjob->ji_qs.ji_fileprefix,
      JOB_CHECKPOINT_SUFFIX,
      pjob->ji_wattr[JOB_ATR_checkpoint_name].at_val.at_str);
    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf),
        "Job has NO checkpoint dir specified, using file %s", momfile);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    }

  if (direction == CKPT_DIR_OUT)
    {
    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf),
        "Requesting checkpoint copy from MOM (%s) to SERVER (%s)",
        momfile,
        serverfile);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    }
  else
    {
    if (LOGLEVEL >= 7)
      {
      snprintf(log_buf, sizeof(log_buf),
        "Requesting checkpoint copy from SERVER (%s) to MOM (%s)",
        serverfile,
        momfile);

      log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
      }
    }

  /* "to" is "<server_name>:<serverfile>": one byte for ':' and one for NUL */
  to = (char *)calloc(1, strlen(serverfile) + strlen(server_name) + 2);

  if (to == NULL)
    {
    /* FAILURE */

    /* cannot allocate memory for request this one */

    log_event(
      PBSEVENT_ERROR | PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "ERROR:  cannot allocate 'to' memory in cpy_checkpoint");

    return(preq);
    }

  strcpy(to, server_name);
  strcat(to, ":");
  strcat(to, serverfile);

  from = (char *)calloc(1, strlen(momfile) + 1);

  if (from == NULL)
    {
    /* FAILURE */

    log_event(
      PBSEVENT_ERROR | PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "ERROR:  cannot allocate 'from' memory for from in cpy_checkpoint");

    free(to);

    return(preq);
    }

  strcpy(from, momfile);

  if (LOGLEVEL >= 7)
    {
    snprintf(log_buf, sizeof(log_buf), "Checkpoint copy from (%s) to (%s)", from, to);

    log_event(PBSEVENT_JOB,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
    }

  /* from and to are handed over to setup_cpyfiles() and not freed here */
  preq = setup_cpyfiles(preq, pjob, from, to, direction, JOBCKPFILE);

  return(preq);
  }  /* END cpy_checkpoint() */




/*
 * remove_checkpoint() - request that mom delete checkpoint file for a job
 * used when the job is to be purged after file has been transferred
 *
 * A copy-files request is built with cpy_checkpoint(), its type is switched
 * to PBS_BATCH_DelFiles and it is handed to relay_to_mom().  When the relay
 * succeeds and the job is still available, JOB_SVFLG_CHECKPOINT_COPIED is
 * cleared on the job.  When cpy_checkpoint() produces no request nothing is
 * done.
 *
 * @param pjob_ptr (I) - address of the pointer to the job
 *
 * NOTE(review): relay_to_mom() is given the address of the local job pointer
 * and the success path tests that pointer for NULL, so it can apparently be
 * cleared by the relay.  *pjob_ptr is not updated here - confirm callers do
 * not rely on it afterwards.
 */

void remove_checkpoint(

  job **pjob_ptr)  /* I */

  {
  static char          *id = "remove_checkpoint";

  struct batch_request *preq = NULL;
  char                  log_buf[LOCAL_LOG_BUF_SIZE];
  char                  jobid[PBS_MAXSVRJOBID + 1];
  job                  *pjob = *pjob_ptr;

  preq = cpy_checkpoint(preq, pjob, JOB_ATR_checkpoint_name, CKPT_DIR_IN);

  if (preq == NULL)
    {
    /* no files to delete */

    return;
    }

  /* keep a private copy of the job id: pjob may no longer be usable once
   * relay_to_mom() returns, but the failure message still needs the id */

  snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);

  /* bounded formatting - the directory and file names come from job
   * attributes and are not limited to the size of log_buf */

  snprintf(log_buf, sizeof(log_buf), "Removing checkpoint file (%s/%s)",
    pjob->ji_wattr[JOB_ATR_checkpoint_dir].at_val.at_str,
    pjob->ji_wattr[JOB_ATR_checkpoint_name].at_val.at_str);

  log_ext(-1, id, log_buf, LOG_DEBUG);

  /* change the request type from copy to delete  */

  preq->rq_type = PBS_BATCH_DelFiles;

  preq->rq_extra = NULL;

  /* The preq is freed in relay_to_mom (failure)
   * or in issue_Drequest (success) */

  if (relay_to_mom(&pjob, preq, release_req) == 0)
    {
    if (pjob != NULL)
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_CHECKPOINT_COPIED;
    }
  else
    {
    /* log that we were unable to remove the files; use the saved id
     * rather than dereferencing pjob, which may have been cleared */

    log_event(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_FILE,
      jobid,
      "unable to remove checkpoint file for job");
    }

  return;
  }  /* END remove_checkpoint() */




/*
 * cleanup_restart_file() - forget the checkpoint restart file of a job.
 * used when the job has completed or put on hold or deleted
 *
 * The restart_name attribute is marked as unset, the job is flagged as
 * modified and a full job save is performed.  No request is sent from
 * this function itself.
 *
 * @param pjob (I, modified) - the job whose restart_name attribute is cleared
 */

void cleanup_restart_file(

  job *pjob)  /* I */

  {
  attribute *restart_attr = &pjob->ji_wattr[JOB_ATR_restart_name];

  /* clear restart_name attribute since purging job will clean it up */

  restart_attr->at_flags &= ~ATR_VFLAG_SET;

  pjob->ji_modified = 1;

  /* the result of the save is not examined */

  job_save(pjob, SAVEJOB_FULL, 0);
  }  /* END cleanup_restart_file() */





/*
 * record_jobinfo() - write an XML-like description of a job to the job log
 *
 * The record consists of a <Jobinfo> element holding the job id, one element
 * per set job attribute (the "depend" attribute is skipped) and, when the
 * SRV_ATR_RecordJobScript server attribute is non-zero, the contents of the
 * job script.  Each piece is handed to log_job_record() separately.
 *
 * @param pjob (I) - the job to describe
 * @return the negative value from job_log_open() if the log cannot be opened,
 *         -1 if the work buffer cannot be allocated, otherwise the first
 *         non-PBSE_NONE value from log_job_record()/append_dynamic_string(),
 *         or the result of the final log_job_record() call.
 */

int record_jobinfo(
    
  job *pjob)

  {
  attribute              *pattr;
  int                     i;
  int                     rc;
  dynamic_string         *buffer;
  char                    job_script_buf[(MAXPATHLEN << 4) + 1];
  char                    namebuf[MAXPATHLEN + 1];
  int                     fd;
  ssize_t                 bytes_read = 0;  /* signed: read() reports errors as -1 */
  extern pthread_mutex_t *job_log_mutex; 
  long                    record_job_script = FALSE;
  
  pthread_mutex_lock(job_log_mutex);
  if ((rc = job_log_open(job_log_file, path_jobinfo_log)) < 0)
    {
    pthread_mutex_unlock(job_log_mutex);
    log_err(rc, __func__, "Could not open job log ");
    return(rc);
    }
  pthread_mutex_unlock(job_log_mutex);

  if ((buffer = get_dynamic_string(MAXLINE << 3, NULL)) == NULL)
    {
    log_err(ENOMEM, __func__, "Can't allocate memory");
    return(-1);
    }
 
  /* record header: the closing tag must match the opening <Job_Id> tag */
  append_dynamic_string(buffer, "<Jobinfo>\n");
  append_dynamic_string(buffer, "\t<Job_Id>");
  append_dynamic_string(buffer, pjob->ji_qs.ji_jobid);
  append_dynamic_string(buffer, "</Job_Id>");
  
  if ((rc = log_job_record(buffer->str)) != PBSE_NONE)
    {
    log_err(rc, __func__, "log_job_record failed");
    free_dynamic_string(buffer);
    return(rc);
    }

  /* one element per set attribute, each logged as its own record */
  for (i = 0; i < JOB_ATR_LAST; i++)
    {
    clear_dynamic_string(buffer);

    pattr = &(pjob->ji_wattr[i]);

    if (pattr->at_flags & ATR_VFLAG_SET)
      {
      if (!strcmp(job_attr_def[i].at_name, "depend"))
        {
        /* we don't want this attribute in our log -
           The dependencies will show on the submit_args attribute */
        continue;
        }
      
      append_dynamic_string(buffer, "\t<");
      append_dynamic_string(buffer, job_attr_def[i].at_name);
      append_dynamic_string(buffer, ">");

      /* resource-type attributes get their value on separate lines */
      if (pattr->at_type == ATR_TYPE_RESC)
        append_dynamic_string(buffer, "\n");
      
      /* the conversion result is not acted upon; rc is overwritten below */
      rc = attr_to_str(buffer, job_attr_def+i, pjob->ji_wattr[i], 1);
      
      if (pattr->at_type == ATR_TYPE_RESC)
        append_dynamic_string(buffer, "\t");

      append_dynamic_string(buffer, "</");
      append_dynamic_string(buffer, job_attr_def[i].at_name);
      append_dynamic_string(buffer, ">");

      if ((rc = log_job_record(buffer->str)) != PBSE_NONE)
        {
        log_err(rc, __func__, "log_job_record failed recording attributes");
        free_dynamic_string(buffer);
        return(rc);
        }
      }
    }
  
  get_svr_attr_l(SRV_ATR_RecordJobScript, &record_job_script);
  if (record_job_script)
    {
    /* This is for Baylor. We will make it a server parameter eventually
     * Write the contents of the script to our log file*/

    /* start from an empty buffer: it may still hold the last attribute
     * that was logged above, which must not be written a second time */
    clear_dynamic_string(buffer);
    
    append_dynamic_string(buffer, "\t<job_script>");
    
    snprintf(namebuf, sizeof(namebuf), "%s%s%s",
      path_jobs, pjob->ji_qs.ji_fileprefix, JOB_SCRIPT_SUFFIX);
    
    /* 0 is a valid descriptor; only a negative value signals failure */
    if ((fd = open(namebuf, O_RDONLY)) >= 0)
      {
      /* leave one byte free for the terminator written after each read;
       * a read error (-1) ends the loop instead of indexing the buffer */
      while ((bytes_read = read(fd, job_script_buf, sizeof(job_script_buf) - 1)) > 0)
        {
        job_script_buf[bytes_read] = '\0';
        
        rc = append_dynamic_string(buffer, job_script_buf);
        }

      close(fd);
      }
    else
      {
      append_dynamic_string(buffer, "unable to open script file\n");
      }
   
    append_dynamic_string(buffer, "\t</job_script>\n");
    
    if ((rc = log_job_record(buffer->str)) != PBSE_NONE)
      {
      free_dynamic_string(buffer);
      log_err(rc, __func__, "log_job_record failed");
      return(rc);
      }
    }
  
  clear_dynamic_string(buffer);

  if ((rc = append_dynamic_string(buffer, "</Jobinfo>\n")) != PBSE_NONE)
    {
    log_err(rc, __func__, "");
    free_dynamic_string(buffer);
    return(rc);
    }

  rc = log_job_record(buffer->str);
      
  free_dynamic_string(buffer);
  return(rc);
  } /* END record_jobinfo() */



/*
 * job_purge_remove_path - remove one on-disk artifact belonging to a job
 *
 * Builds the path <dir><job file prefix><suffix> and removes it, with
 * remtree() when is_tree is non-zero and with unlink() otherwise.  A failure
 * other than ENOENT is reported through log_err() under the name
 * "job_purge"; a successful removal is recorded with removed_msg when
 * LOGLEVEL is at least 6.
 */

static void job_purge_remove_path(

  job        *pjob,         /* I - job owning the artifact */
  const char *dir,          /* I - directory prefix of the path */
  const char *suffix,       /* I - file name suffix */
  const char *removed_msg,  /* I - text logged after a successful removal */
  int         is_tree)      /* I - non-zero: remove with remtree() */

  {
  extern char *msg_err_purgejob;

  char         log_buf[LOCAL_LOG_BUF_SIZE];
  char         namebuf[MAXPATHLEN + 1];
  int          rc;

  snprintf(namebuf, sizeof(namebuf), "%s%s%s", dir, pjob->ji_qs.ji_fileprefix, suffix);

  if (is_tree)
    rc = remtree(namebuf);
  else
    rc = unlink(namebuf);

  if (rc < 0)
    {
    /* a missing file is not an error worth reporting */
    if (errno != ENOENT)
      log_err(errno, "job_purge", msg_err_purgejob);
    }
  else if (LOGLEVEL >= 6)
    {
    snprintf(log_buf, sizeof(log_buf), "%s", removed_msg);

    log_record(PBSEVENT_DEBUG,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);
    }
  }  /* END job_purge_remove_path() */




/*
 * job_purge - purge job from system
 *
 * The job is dequeued; the job control file, script file and any spooled
 * output files are unlinked, and the job structure is freed.
 * If we are MOM, the task files and checkpoint files are also
 * removed.
 *
 * Steps, in order:
 *  - when SRV_ATR_RecordJobInfo is non-zero, the job is written to the job
 *    log and the job log monitoring task is scheduled once;
 *  - unless the job is in substate TRANSIN or TRANSICM it is dequeued;
 *    the function stops there if the job id is empty afterwards;
 *  - an array sub-job is detached from its array (the array is deleted
 *    when every job has been purged, otherwise it is saved);
 *  - non-array jobs and array templates are removed from array_summary;
 *  - script (non-array jobs and templates only), spooled stdout/stderr,
 *    checkpoint directory and job file are deleted;
 *  - the job structure is released with job_free().
 *
 * @param pjob (I, modified) - the job to purge
 */

void job_purge(

  job *pjob)  /* I (modified) */

  {
  static char   id[] = "job_purge";

  char          log_buf[LOCAL_LOG_BUF_SIZE];
  time_t        time_now = time(NULL);
  long          record_job_info = FALSE;

  /* check to see if we are keeping a log of all jobs completed */
  get_svr_attr_l(SRV_ATR_RecordJobInfo, &record_job_info);
  if (record_job_info)
    {
    record_jobinfo(pjob);

    /* Start a task to monitor job log roll over if it is not already started */
    if (check_job_log_started == 0)
      {
      set_task(WORK_Timed, time_now + 10, check_job_log, NULL, FALSE);

      check_job_log_started = 1;
      }
    }

  if ((pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRANSIN) &&
      (pjob->ji_qs.ji_substate != JOB_SUBSTATE_TRANSICM))
    {
    svr_dequejob(pjob, FALSE);

    if (strlen(pjob->ji_qs.ji_jobid) == 0)
      return;
    }

  /* if part of job array then remove from array's job list */
  if ((pjob->ji_arraystruct) != NULL &&
      (pjob->ji_is_array_template == FALSE))
    {
    /* pa->ai_mutex will come out locked after 
       the call to get_jobs_array */
    job_array *pa = get_jobs_array(&pjob);

    /* get_jobs_array clears pjob when the job could not be found again */
    if (pjob == NULL)
      return;

    /* erase the pointer to this job in the job array */
    free(pa->job_ids[pjob->ji_wattr[JOB_ATR_job_array_id].at_val.at_long]);
    pa->job_ids[pjob->ji_wattr[JOB_ATR_job_array_id].at_val.at_long] = NULL;
    
    /* if there are no more jobs in the array,
     * then we can clean that up too */
    pa->ai_qs.num_purged++;
    if (pa->ai_qs.num_purged == pa->ai_qs.num_jobs)
      {
      /* array_delete will unlock pa->ai_mutex */
      array_delete(pa);
      }
    else
      {
      array_save(pa);
      
      pthread_mutex_unlock(pa->ai_mutex);
      if (LOGLEVEL >=7)
        {
        sprintf(log_buf, "unlocked ai_mutex: %s", id);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, log_buf);
        }
      }
    }

  if ((pjob->ji_is_array_template == TRUE) ||
      (pjob->ji_arraystruct == NULL))
    remove_job(&array_summary,pjob);

  /* delete the script file (array sub-jobs do not have one of their own) */
  if ((pjob->ji_arraystruct == NULL) ||
      (pjob->ji_is_array_template == TRUE))
    {
    job_purge_remove_path(pjob, path_jobs, JOB_SCRIPT_SUFFIX, "removed job script", FALSE);
    }

  /* delete any spooled stdout */
  job_purge_remove_path(pjob, path_jobs, JOB_STDOUT_SUFFIX, "removed job stdout", FALSE);

  /* delete any spooled stderr */
  job_purge_remove_path(pjob, path_jobs, JOB_STDERR_SUFFIX, "removed job stderr", FALSE);

  /* remove checkpoint restart file if there is one */
  if (pjob->ji_wattr[JOB_ATR_restart_name].at_flags & ATR_VFLAG_SET)
    {
    cleanup_restart_file(pjob);
    }

  /* delete checkpoint file directory if there is one */
  if (pjob->ji_wattr[JOB_ATR_checkpoint_name].at_flags & ATR_VFLAG_SET)
    {
    job_purge_remove_path(pjob, path_checkpoint, JOB_CHECKPOINT_SUFFIX, "removed job checkpoint", TRUE);
    }

  /* delete the job file; array templates use a different suffix */
  if (pjob->ji_is_array_template == TRUE)
    {
    job_purge_remove_path(pjob, path_jobs, JOB_FILE_TMP_SUFFIX, "removed job file", FALSE);
    }
  else
    {
    job_purge_remove_path(pjob, path_jobs, JOB_FILE_SUFFIX, "removed job file", FALSE);
    }

  job_free(pjob);

  return;
  }  /* END job_purge() */




/*
 * get_correct_jobname() - makes sure the job searches for the correct name
 * necessary because of SRV_ATR_display_job_server_suffix and
 * SRV_ATR_job_suffix_alias
 *
 * allocs the correct job name
 * @param jobid (I) - the jobid as passed in (NUM.SERVER_NAME)
 * @return a pointer to the correct job name (alloc'd)
 */
char *get_correct_jobname(

  const char *jobid) /* I */

  {
  char *correct = NULL;
  char *dot;
  /* first suffix could be the server name or the alias */
  char *first_suffix = NULL;

  /* second suffix can only be the alias */
  char *second_suffix = NULL;
  int   server_suffix = TRUE;

  int len;

  char *id = "get_correct_jobname";
  long  display_suffix = TRUE;
  char *alias = NULL;

  get_svr_attr_l(SRV_ATR_display_job_server_suffix, &display_suffix);
  if (display_suffix == FALSE)
    server_suffix = FALSE;

  if ((dot = strchr(jobid,'.')) != NULL)
    {
    first_suffix = dot + 1;

    if ((dot = strchr(first_suffix,'.')) != NULL)
      {
      second_suffix = dot + 1;
      }
    }

  dot = NULL;

  /* check current settings */
  get_svr_attr_str(SRV_ATR_job_suffix_alias, &alias);
  if ((alias != NULL) &&
      (server_suffix == TRUE))
    {
    /* display the server suffix and the alias */

    /* check if alias is already there */
    if (second_suffix != NULL)
      {
      if (strcmp(second_suffix,alias) == 0)
        {
        correct = strdup(jobid);

        if (correct == NULL)
          log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");

        return(correct);
        }
      }
    else if (first_suffix == NULL)
      {
      /* alloc memory and sprint, add 3 for 2 '.' and NULL terminator */
      len = strlen(jobid) + strlen(server_name) + strlen(alias) + 3;
      correct = calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }

      snprintf(correct,len,"%s.%s.%s",
        jobid,server_name,alias);
      }
    else
      {
      /* add 2 for null terminator and '.' */
      len = strlen(alias) + 2 + strlen(jobid);

      correct = calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",jobid,alias);
      }
    } /* END if (server_suffix && alias) */
  else if (server_suffix == TRUE)
    {
    /* just the server suffix */

    /* check for the server suffix */
    if (second_suffix != NULL)
      {
      dot = second_suffix - 1;
      *dot = '\0';

      len = strlen(jobid) + 1 ;

      correct = calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }

      snprintf(correct,len,"%s",jobid);
      *dot = '.';
      }
    else if (first_suffix != NULL)
      {
      correct = strdup(jobid);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }
      }
    else
      {
      len = strlen(jobid) + strlen(server_name) + 2;

      correct = calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",
        jobid,server_name);
      }
    } /* END if (just server_suffix) */
  else if (alias != NULL)
    {
    /* just the alias, not the server */

    if (first_suffix == NULL)
      {
      len = strlen(jobid) + strlen(alias) + 2;

      correct = calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",jobid,alias);
      }
    else
      {
      len = strlen(alias) + 2;

      dot = first_suffix - 1;
      *dot = '\0';

      len += strlen(jobid);
      correct = calloc(1, len);

      if (correct == NULL)
        {
        log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
        return(NULL);
        }

      snprintf(correct,len,"%s.%s",
        jobid,
        alias);

      *dot = '.';
      }
    } /* END else if (just alias) */
  else
    {
    /* no server suffix nor alias */
    if (first_suffix != NULL)
      {
      dot = first_suffix - 1;
      *dot = '\0';
      }

    len = strlen(jobid) + 1;
    correct = calloc(1, len);

    if (correct == NULL)
      {
      log_err(-1,id,"ERROR:    Fatal - Cannot allocate memory\n");
      return(NULL);
      }

    snprintf(correct,len,"%s",jobid);

    if (first_suffix != NULL)
      *dot = '.';
    }

  return(correct);
  } /* END get_correct_jobname() */



/*
 * find_job_regular_jobs() - look a job up in the alljobs container
 *
 * The hash lookup is done while holding alljobs.alljobs_mutex; that mutex is
 * released before the job's own mutex is taken.
 *
 * @param searchable_jobid (I) - the key to look up in the alljobs hash
 * @return the job with its ji_mutex locked, or NULL if the id is not in the
 *         hash or the job is being recycled
 */

job *find_job_regular_jobs(

  char *searchable_jobid)

  {
  job *found = NULL;
  int  slot;

  pthread_mutex_lock(alljobs.alljobs_mutex);

  slot = get_value_hash(alljobs.ht, searchable_jobid);

  if (slot >= 0)
    found = (job *)alljobs.ra->slots[slot].item;

  pthread_mutex_unlock(alljobs.alljobs_mutex);

  if (found == NULL)
    return(NULL);

  pthread_mutex_lock(found->ji_mutex);

  if (found->ji_being_recycled != TRUE)
    return(found);

  /* a job that is being recycled is reported as not found */
  pthread_mutex_unlock(found->ji_mutex);

  return(NULL);
  } /* END find_job_regular_jobs() */



/*
 * find_job_array_jobs() - look a job up in the array_summary container
 *
 * The hash lookup is done while holding array_summary.alljobs_mutex; that
 * mutex is released before the job's own mutex is taken.
 *
 * @param searchable_jobid (I) - the key to look up in the array_summary hash
 * @return the job with its ji_mutex locked, or NULL if the id is not in the
 *         hash or the job is being recycled
 */

job *find_job_array_jobs(

  char *searchable_jobid)

  {
  job *found = NULL;
  int  slot;

  pthread_mutex_lock(array_summary.alljobs_mutex);

  slot = get_value_hash(array_summary.ht, searchable_jobid);

  if (slot >= 0)
    found = (job *)array_summary.ra->slots[slot].item;

  pthread_mutex_unlock(array_summary.alljobs_mutex);

  if (found == NULL)
    return(NULL);

  pthread_mutex_lock(found->ji_mutex);

  if (found->ji_being_recycled != TRUE)
    return(found);

  /* a job that is being recycled is reported as not found */
  pthread_mutex_unlock(found->ji_mutex);

  return(NULL);
  } /* END find_job_array_jobs() */





/*
 * find_job() - find job by jobid
 *
 * Search list of all server jobs for one with same job id
 * Return NULL if not found or pointer to job struct if found
 *
 * Any "@server_name" part is cut off for the duration of the search by
 * temporarily writing a terminator into jobid; the '@' is put back before
 * returning on every path.  When either SRV_ATR_display_job_server_suffix
 * or SRV_ATR_job_suffix_alias is set, the id is first normalised with
 * get_correct_jobname().
 *
 * Ids containing "[]" are looked up in the array summary only; all other ids
 * are looked up in the regular job list first and then in the array summary.
 *
 * @param jobid (I) - job id to search for (modified temporarily, see above)
 * @return the job as returned by find_job_regular_jobs() or
 *         find_job_array_jobs() (i.e. with its mutex locked), or NULL
 */

job *find_job(

  char *jobid)

  {
  char *at;
  char *comp = jobid;
  int   allocated = FALSE;

  job  *pj = NULL;

  if ((at = strchr(jobid, (int)'@')) != NULL)
    *at = '\0'; /* strip off @server_name */

  if ((is_svr_attr_set(SRV_ATR_display_job_server_suffix)) ||
      (is_svr_attr_set(SRV_ATR_job_suffix_alias)))
    {
    /* comp is newly allocated here (or NULL on failure) */
    comp = get_correct_jobname(jobid);
    allocated = TRUE;
    }

  /* comp is NULL only when the name could not be built; the search is then
   * skipped but the clean-up below still runs so jobid is left intact */
  if (comp != NULL)
    {
    if (strstr(jobid,"[]") == NULL)
      {
      pj = find_job_regular_jobs(comp);

      /* when remotely routing jobs, they are removed from the 
       * regular job list first and the array summary after. 
       * Attempt to find them there if NULL */
      if (pj == NULL)
        pj = find_job_array_jobs(comp);
      } /* END if (not an array template job) */
    else
      {
      pj = find_job_array_jobs(comp);
      } /* END if (job is array template) */
    }

  if (at != NULL)
    *at = '@'; /* restore @server_name */

  if (allocated)
    free(comp);

  return(pj);  /* may be NULL */
  }   /* END find_job() */




/*
 * initialize_all_jobs_array() - set up an all_jobs container
 *
 * Creates the resizable array and the hash table and allocates and
 * initialises the container's mutex.  The function has no return value; if
 * the mutex cannot be allocated the failure is logged and alljobs_mutex is
 * left NULL instead of being handed to pthread_mutex_init().
 *
 * @param aj (O) - the container to initialise
 */
void initialize_all_jobs_array(
    
  struct all_jobs *aj)

  {
  aj->ra = initialize_resizable_array(INITIAL_JOB_SIZE);
  aj->ht = create_hash(INITIAL_HASH_SIZE);

  aj->alljobs_mutex = calloc(1, sizeof(pthread_mutex_t));

  if (aj->alljobs_mutex == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory for the all_jobs mutex");

    return;
    }

  pthread_mutex_init(aj->alljobs_mutex,NULL);
  } /* END initialize_all_jobs_array() */




/*
 * insert_job() - add a job to an all_jobs container
 *
 * The job is stored in the container's array and its id is registered in
 * the hash under the slot the array reported.  Both steps happen while
 * holding the container's mutex.
 *
 * @param aj - the container to insert into
 * @param pjob - the job to be inserted
 * @return PBSE_NONE on success, ENOMEM if the array insert reports -1
 */
int insert_job(
    
  struct all_jobs *aj, 
  job             *pjob)

  {
  int status = PBSE_NONE;
  int slot;

  pthread_mutex_lock(aj->alljobs_mutex);

  slot = insert_thing(aj->ra,pjob);

  if (slot != -1)
    {
    add_hash(aj->ht, slot, pjob->ji_qs.ji_jobid);
    }
  else
    {
    status = ENOMEM;
    log_err(status, __func__, "No memory to resize the array...SYSTEM FAILURE\n");
    }

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(status);
  } /* END insert_job() */





/*
 * insert_job_after() - add a job to a container right after another job
 *
 * The position of already_in is taken from the container's hash; the new
 * job is inserted after that position and registered in the hash.  All of
 * this happens while holding the container's mutex.
 *
 * @param aj - the container to insert into
 * @param already_in - the job this job should follow 
 * @param pjob - the job to be inserted
 * @return PBSE_NONE if the job is inserted correctly, THING_NOT_FOUND if
 *         already_in is not in the hash, ENOMEM if the array insert
 *         reports -1
 */
int insert_job_after(

  struct all_jobs *aj,
  job             *already_in,
  job             *pjob)

  {
  int status = THING_NOT_FOUND;
  int anchor;
  int slot;

  pthread_mutex_lock(aj->alljobs_mutex);

  anchor = get_value_hash(aj->ht,already_in->ji_qs.ji_jobid);

  if (anchor >= 0)
    {
    slot = insert_thing_after(aj->ra,pjob,anchor);

    if (slot != -1)
      {
      add_hash(aj->ht,slot,pjob->ji_qs.ji_jobid);
      status = PBSE_NONE;
      }
    else
      {
      status = ENOMEM;
      log_err(status, __func__, "No memory to resize the array...SYSTEM FAILURE");
      }
    }

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(status);
  } /* END insert_job_after() */




/*
 * insert_job_after_index() - add a job to a container after a given slot
 *
 * Same as inserting after another job, except that the caller supplies the
 * array index directly instead of a job to look up.
 *
 * @param aj - the container to insert into
 * @param index - array index the new job should follow
 * @param pjob - the job to be inserted
 * @return PBSE_NONE on success, ENOMEM if the array insert reports -1
 */
int insert_job_after_index(

  struct all_jobs *aj,
  int              index,
  job             *pjob)

  {
  int status = PBSE_NONE;
  int slot;

  pthread_mutex_lock(aj->alljobs_mutex);

  slot = insert_thing_after(aj->ra, pjob, index);

  if (slot != -1)
    {
    add_hash(aj->ht, slot, pjob->ji_qs.ji_jobid);
    }
  else
    {
    status = ENOMEM;
    log_err(status, __func__, "No memory to resize the array...SYSTEM FAILURE");
    }

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(status);
  } /* END insert_job_after_index() */





/*
 * insert_job_first() - add a job to a container after ALWAYS_EMPTY_INDEX
 *
 * The job is inserted after the ALWAYS_EMPTY_INDEX slot of the container's
 * array and registered in the hash, all while holding the container's mutex.
 *
 * @param aj - the container to insert into
 * @param pjob - the job to be inserted
 * @return PBSE_NONE on success, ENOMEM if the array insert reports -1
 */
int insert_job_first(

  struct all_jobs *aj,
  job             *pjob)

  {
  int status = PBSE_NONE;
  int slot;

  pthread_mutex_lock(aj->alljobs_mutex);

  slot = insert_thing_after(aj->ra,pjob,ALWAYS_EMPTY_INDEX);

  if (slot != -1)
    {
    add_hash(aj->ht,slot,pjob->ji_qs.ji_jobid);
    }
  else
    {
    status = ENOMEM;
    log_err(status,__func__,"No memory to resize the array...SYSTEM FAILURE");
    }

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(status);
  } /* END insert_job_first () */




/*
 * get_jobs_index() - look up the array index of a job in a container
 *
 * The container mutex is first attempted without blocking.  If that fails,
 * the job's mutex is released, the container mutex is taken with a blocking
 * lock and the job's mutex is taken again, so the function never blocks on
 * the container while holding the job.
 * NOTE(review): this implies the caller holds pjob->ji_mutex on entry -
 * confirm against callers.
 *
 * @param aj - the container to search
 * @param pjob - the job whose index is wanted
 * @return the value get_value_hash() reports for the job's id
 */

int get_jobs_index(

  struct all_jobs *aj,
  job             *pjob)

  {
  int slot;

  if (pthread_mutex_trylock(aj->alljobs_mutex) != 0)
    {
    /* could not get the container right away: back off from the job
     * and take the two locks in container-then-job order */
    pthread_mutex_unlock(pjob->ji_mutex);
    pthread_mutex_lock(aj->alljobs_mutex);
    pthread_mutex_lock(pjob->ji_mutex);
    }

  slot = get_value_hash(aj->ht, pjob->ji_qs.ji_jobid);

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(slot);
  } /* END get_jobs_index() */




/*
 * has_job() - report whether a job's id is registered in a container
 *
 * The hash lookup is done while holding the container's mutex.
 *
 * @param aj - the container to search
 * @param pjob - the job to look for
 * @return TRUE if the job's id is in the container's hash, FALSE otherwise
 */

int has_job(

  struct all_jobs *aj,
  job             *pjob)

  {
  int present;

  pthread_mutex_lock(aj->alljobs_mutex);

  present = (get_value_hash(aj->ht,pjob->ji_qs.ji_jobid) < 0) ? FALSE : TRUE;

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(present);
  } /* END has_job() */





/*
 * remove_job() - take a job out of an all_jobs container
 *
 * The container mutex is first attempted without blocking.  If that fails,
 * the job's mutex is released, the container mutex is taken with a blocking
 * lock and the job's mutex is taken again.  The job is then removed from
 * both the array and the hash.
 * NOTE(review): this implies the caller holds pjob->ji_mutex on entry -
 * confirm against callers.
 *
 * @param aj - the container to remove from
 * @param pjob - the job to remove
 * @return PBSE_NONE if the job is removed, THING_NOT_FOUND if its id is not
 *         in the container's hash
 */

int  remove_job(
   
  struct all_jobs *aj, 
  job             *pjob)

  {
  int status = PBSE_NONE;
  int slot;

  if (pthread_mutex_trylock(aj->alljobs_mutex) != 0)
    {
    /* could not get the container right away: back off from the job
     * and take the two locks in container-then-job order */
    pthread_mutex_unlock(pjob->ji_mutex);
    pthread_mutex_lock(aj->alljobs_mutex);
    pthread_mutex_lock(pjob->ji_mutex);
    }

  slot = get_value_hash(aj->ht,pjob->ji_qs.ji_jobid);

  if (slot >= 0)
    {
    remove_thing_from_index(aj->ra,slot);
    remove_hash(aj->ht,pjob->ji_qs.ji_jobid);
    }
  else
    {
    status = THING_NOT_FOUND;
    }

  pthread_mutex_unlock(aj->alljobs_mutex);

  return(status);
  } /* END remove_job() */





job *next_job(

  struct all_jobs *aj,
  int             *iter)

  {
  job *pjob;

  pthread_mutex_lock(aj->alljobs_mutex);

  pjob = (job *)next_thing(aj->ra,iter);

  pthread_mutex_unlock(aj->alljobs_mutex);

  if (pjob != NULL)
    {
    pthread_mutex_lock(pjob->ji_mutex);

    if (pjob->ji_being_recycled == TRUE)
      {
      pthread_mutex_unlock(pjob->ji_mutex);

      pjob = next_job(aj,iter);
      }
    }

  return(pjob);
  } /* END next_job() */





job *next_job_from_back(

  struct all_jobs *aj,
  int             *iter)

  {
  job *pjob;

  pthread_mutex_lock(aj->alljobs_mutex);

  pjob = (job *)next_thing_from_back(aj->ra,iter);

  pthread_mutex_unlock(aj->alljobs_mutex);

  if (pjob != NULL)
    {
    pthread_mutex_lock(pjob->ji_mutex);

    if (pjob->ji_being_recycled == TRUE)
      {
      pthread_mutex_unlock(pjob->ji_mutex);

      pjob = next_job_from_back(aj,iter);
      }
    }

  return(pjob);
  } /* END next_job_from_back() */




/*
 * swap_jobs() - exchange the positions of two jobs in a container
 *
 * currently this function can only be called for jobs in the alljobs array
 *
 * Both jobs are looked up in the hash; if either lookup reports a negative
 * index nothing is changed.  Otherwise the array entries are swapped and
 * each job's hash entry is pointed at the index the other job had.
 *
 * NOTE(review): the hash is updated regardless of what swap_things()
 * returns - confirm swap_things() cannot fail once both jobs were found.
 *
 * @param aj - the container; NULL selects the global alljobs container
 * @param job1 - first job
 * @param job2 - second job
 * @return THING_NOT_FOUND if either job is not in the hash, otherwise the
 *         value returned by swap_things()
 */
int swap_jobs(

  struct all_jobs *aj,
  job             *job1,
  job             *job2)

  {
  int rc;
  int job1_index;
  int job2_index;

  if (aj == NULL)
    {
    aj = &alljobs;
    }

  pthread_mutex_lock(aj->alljobs_mutex);

  job1_index = get_value_hash(aj->ht,job1->ji_qs.ji_jobid);
  job2_index = get_value_hash(aj->ht,job2->ji_qs.ji_jobid);

  /* any negative index means "not found", as everywhere else in this file */
  if ((job1_index < 0) ||
      (job2_index < 0))
    {
    rc = THING_NOT_FOUND;
    }
  else
    {
    rc = swap_things(aj->ra,job1,job2);
    
    /* each job now lives where the other one was */
    change_value_hash(aj->ht,job1->ji_qs.ji_jobid,job2_index);
    change_value_hash(aj->ht,job2->ji_qs.ji_jobid,job1_index);
    }

  pthread_mutex_unlock(aj->alljobs_mutex);
  
  return(rc);
  } /* END swap_jobs() */




/* 
 * get_jobs_array() - obtain a job's array struct with the array mutex held
 *
 * Always access the array in this way in order to avoid deadlock 
 *
 * The array mutex is first attempted without blocking.  If that fails, the
 * job's mutex is released, the array mutex is taken with a blocking lock and
 * the job is looked up again by id with find_job().  If the job can no
 * longer be found, the array mutex is released, *pjob_ptr is set to NULL and
 * NULL is returned.
 * NOTE(review): this implies the caller holds the job's mutex on entry -
 * confirm against callers.
 *
 * @param pjob_ptr (I/O) - address of the job pointer; set to NULL when the
 *        job disappeared while its mutex was released
 * @return this job's array struct with its mutex locked, or NULL if the job
 *         has no array struct or could not be found again
 */

job_array *get_jobs_array(

  job **pjob_ptr)

  {
  char       log_buf[LOCAL_LOG_BUF_SIZE];
  /* one byte more than PBS_MAXSVRJOBID so that an id of that many
   * characters fits with its terminator - TODO confirm against the
   * declaration of ji_jobid */
  char       jobid[PBS_MAXSVRJOBID + 1];
  job       *pjob = *pjob_ptr;
  job_array *pa = pjob->ji_arraystruct;

  if (pa != NULL)
    {
    if (pthread_mutex_trylock(pa->ai_mutex))
      {
      /* remember the id before letting go of the job: the job must not be
       * touched again until it has been found and locked anew */
      snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);
      
      pthread_mutex_unlock(pjob->ji_mutex);
      if (LOGLEVEL >=7)
        {
        snprintf(log_buf, sizeof(log_buf), "locking ai_mutex: %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobid, log_buf);
        }
      pthread_mutex_lock(pa->ai_mutex);
      if (LOGLEVEL >=7)
        {
        snprintf(log_buf, sizeof(log_buf), "locked ai_mutex: %s", __func__);
        log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, jobid, log_buf);
        }
      
      if ((pjob = find_job(jobid)) == NULL)
        {
        pthread_mutex_unlock(pa->ai_mutex);
        pa = NULL;
        *pjob_ptr = NULL;
        }
      }
    }

  return(pa);
  } /* END get_jobs_array() */



/*
 * get_jobs_queue() - obtain a job's queue with the queue mutex held
 *
 * The queue mutex is first attempted without blocking.  If that fails, the
 * job's mutex is released, the queue is locked with lock_queue() and the job
 * is looked up again by id with find_job().  If the job can no longer be
 * found, the queue is unlocked, *pjob_ptr is set to NULL and NULL is
 * returned.
 * NOTE(review): this implies the caller holds the job's mutex on entry -
 * confirm against callers.
 *
 * @param pjob_ptr (I/O) - address of the job pointer; set to NULL when the
 *        job disappeared while its mutex was released
 * @return the job's queue with its mutex locked, or NULL if the job has no
 *         queue or could not be found again
 */

pbs_queue *get_jobs_queue(

  job **pjob_ptr)

  {
  /* one byte more than PBS_MAXSVRJOBID so that an id of that many
   * characters fits with its terminator - TODO confirm against the
   * declaration of ji_jobid */
  char       jobid[PBS_MAXSVRJOBID + 1];
  job       *pjob = *pjob_ptr;
  pbs_queue *pque = pjob->ji_qhdr;

  if (pque != NULL)
    {
    if (pthread_mutex_trylock(pque->qu_mutex))
      {
      /* if fail: remember the id (bounded copy), release the job and
       * wait for the queue */
      snprintf(jobid, sizeof(jobid), "%s", pjob->ji_qs.ji_jobid);
      pthread_mutex_unlock(pjob->ji_mutex);
      lock_queue(pque, __func__, NULL, LOGLEVEL);

      if ((pjob = find_job(jobid)) == NULL)
        {
        unlock_queue(pque, __func__, NULL, 0);
        pque = NULL;
        *pjob_ptr = NULL;
        }
      }
    }

  return(pque);
  } /* END get_jobs_queue() */



/* END job_func.c */

