/*
*         OpenPBS (Portable Batch System) v2.3 Software License
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
* other than those described below, or to purchase support for this software,
* please contact Veridian Systems, PBS Products Department ("Licensor") at:
*
*    www.OpenPBS.org  +1 650 967-4675                  sales@OpenPBS.org
*                        877 902-4PBS (US toll-free)
* ---------------------------------------------------------------------------
*
* This license covers use of the OpenPBS v2.3 software (the "Software") at
* your site or location, and, for certain users, redistribution of the
* Software to other sites and locations.  Use and redistribution of
* OpenPBS v2.3 in source and binary forms, with or without modification,
* are permitted provided that all of the following conditions are met.
* After December 31, 2001, only conditions 3-6 must be met:
*
* 3. Any Redistribution of source code must retain the above copyright notice
*    and the acknowledgment contained in paragraph 6, this list of conditions
*    and the disclaimer contained in paragraph 7.
*
* 4. Any Redistribution in binary form must reproduce the above copyright
*    notice and the acknowledgment contained in paragraph 6, this list of
*    conditions and the disclaimer contained in paragraph 7 in the
*    documentation and/or other materials provided with the distribution.
*
* 5. Redistributions in any form must be accompanied by information on how to
*    obtain complete source code for the OpenPBS software and any
*    modifications and/or additions to the OpenPBS software.  The source code
*    must either be included in the distribution or be available for no more
*    than the cost of distribution plus a nominal fee, and all modifications
*    and additions to the Software must be freely redistributable by any party
*    (including Licensor) without restriction.
*
* 6. All advertising materials mentioning features or use of the Software must
*    display the following acknowledgment:
*
*     "This product includes software developed by NASA Ames Research Center,
*     Lawrence Livermore National Laboratory, and Veridian Information
*     Solutions, Inc.
*     Visit www.OpenPBS.org for OpenPBS software support,
*     products, and information."
*
* 7. DISCLAIMER OF WARRANTY
*
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT
* ARE EXPRESSLY DISCLAIMED.
*
* IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE
* U.S. GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/
#include <pbs_config.h>   /* the master config generated by configure */
#include "pbsd_init.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <memory.h>
#include <time.h>
#include <unistd.h>
#include <grp.h>
#include <semaphore.h>

#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <dirent.h>
#ifdef _CRAY
#include <sys/category.h>
#endif /* _CRAY */
#include <sys/time.h>
#include <sys/resource.h>

#include <pthread.h>

#include "pbs_ifl.h"
#include "log.h"
#include "../lib/Liblog/pbs_log.h"
#include "../lib/Liblog/log_event.h"
#include "../lib/Liblog/setup_env.h"
#include "../lib/Liblog/chk_file_sec.h"
#include "list_link.h"
#include "attribute.h"
#include "server_limits.h"
#include "server.h"
#include "pbs_job.h"
#include "resource.h"
#include "work_task.h"
#include "tracking.h"
#include "svrfunc.h"
#include "acct.h"
#include "rpp.h"
#include "net_connect.h"
#include "pbs_proto.h"
#include "batch_request.h"
#include "array.h"
#include "csv.h"
#include "pbs_nodes.h"
#include "threadpool.h"
#include "../lib/Libutils/u_lock_ctl.h" /* unlock_node */
#include "queue_recov.h" /* que_recov_xml */
#include "dynamic_string.h"
#include "utils.h"
#include "queue_recycler.h" /* queue_recycler */
#include "svr_task.h" /* initialize_task_recycler */
#include "svr_func.h" /* get_svr_attr_* */
#include "login_nodes.h"
#include "track_alps_reservations.h"
#include "job_func.h" /* job_purge */
#include "net_cache.h"

/*#ifndef SIGKILL*/
/* there is some weird stuff in gcc include files signal.h & sys/params.h */
#include <signal.h>
/*#endif*/

#ifndef TRUE
#define TRUE 1
#endif /* TRUE */

#ifndef FALSE
#define FALSE 0
#endif /* FALSE */

/* global Data Items */

/* defined in this file (no extern) */
struct addrinfo hints;

/* message strings defined elsewhere */
extern char *msg_daemonname;
extern char *msg_init_abt;
extern char *msg_init_queued;
extern char *msg_init_substate;
extern char *msg_err_noqueue;
extern char *msg_err_malloc;
extern char *msg_init_noqueues;
extern char *msg_init_recovque;
extern char *msg_init_expctq;
extern char *msg_init_nojobs;
extern char *msg_init_exptjobs;
extern char *msg_init_norerun;
extern char *msg_init_unkstate;
extern char *msg_init_baddb;
extern char *msg_init_chdir;
extern char *msg_init_badjob;
extern char *msg_script_open;

/* file and directory paths defined elsewhere; most of the path_* values
 * are assigned by initialize_paths() in this file */
extern char *acct_file;
extern char *log_file;
extern char *job_log_file;
extern char *path_home;
extern char *path_acct;
extern char  path_log[];
extern char *path_priv;
extern char *path_arrays;
extern char *path_jobs;
extern char *path_credentials;
extern char *path_queues;
extern char *path_spool;
extern char *path_svrdb;
extern char *path_svrdb_new;
extern char *path_svrlog;
extern char *path_track;
extern char *path_nodes;
extern char *path_mom_hierarchy;
extern char *path_nodes_new;
extern char *path_nodestate;
extern char *path_nodenote;
extern char *path_nodenote_new;
extern char *path_checkpoint;
extern char *path_jobinfo_log;


/* server-wide containers: the ones without 'extern' are defined here */
extern int              queue_rank;
extern char             server_name[];
extern tlist_head       svr_newnodes;
extern all_tasks        task_list_timed;
extern all_tasks        task_list_event;
task_recycler           tr;
extern struct all_jobs  alljobs;
extern struct all_jobs  array_summary;
extern struct all_jobs  newjobs;
all_queues              svr_queues;
job_recycler            recycler;
queue_recycler          q_recycler;

dynamic_string         *hierarchy_holder;
/* both hello containers are set up by initialize_data_structures_and_mutexes() */
hello_container         hellos;
hello_container         failures;

reservation_holder      alps_reservations;
batch_request_holder    brh;

/* mutexes: allocated and initialized by initialize_data_structures_and_mutexes() */
extern pthread_mutex_t *acctfile_mutex;
/* held while scheduler_sock and scheduler_jobct are reset during initialization */
pthread_mutex_t        *scheduler_sock_jobct_mutex;
extern int              scheduler_sock;
extern int              scheduler_jobct;
extern pthread_mutex_t *svr_do_schedule_mutex;
extern pthread_mutex_t *listener_command_mutex;
extern pthread_mutex_t *node_state_mutex;
extern pthread_mutex_t *check_tasks_mutex;

extern int a_opt_init;

extern int LOGLEVEL;
extern char *plogenv;

extern struct server server;

/* External Functions Called */

/* prototypes for routines this file expects to be defined elsewhere */
extern void   on_job_rerun(struct work_task *);
extern void   set_resc_assigned(job *, enum batch_op);
extern void   set_old_nodes(job *);
extern void   acct_close(void);

extern struct work_task *apply_job_delete_nanny(struct job *, int);
extern int     net_move(job *, struct batch_request *);
/* declared without 'extern'; for a function declaration the linkage is the same */
void          on_job_exit(struct work_task *);

/* Private functions in this file */

/* NOTE(review): none of these prototypes is declared 'static', so the
 * functions have external linkage despite the "private" label above */
void  init_abt_job(job *);
char *build_path(char *, char *, char *);
void  catch_abort(int);
void  change_logs(int);
void  change_log_level(int);
int   chk_save_file(char *);
int   pbsd_init_job(job *, int);
void  pbsd_init_reque(job *, int);
void  resume_net_move(struct work_task *);
void  rm_files(char *);
void  stop_me(int);

/* private data */

/* NOTE(review): presumably the values passed as the int argument of
 * pbsd_init_reque() - confirm at the call sites */
#define CHANGE_STATE 1
#define KEEP_STATE   0

/**
 * Initialize a dynamic array to a specific size.
 *
 * A non-positive InitialSize yields an empty array (Length 0, Data NULL);
 * otherwise InitialSize zero-filled slots are allocated.
 *
 * @param Array (O) Assumed to be an uninitialized struct
 * @param InitialSize (I) number of slots to allocate; values <= 0 are
 *                        treated as 0
 * @return SUCCESS, or FAILURE when the slot storage cannot be allocated.
 *         On FAILURE the array is left empty (Length 0, Data NULL,
 *         AppendIndex 0) rather than half-initialized.
 */

int DArrayInit(

  darray_t *Array,      /* O */
  int       InitialSize) /* I */

  {
  /* start from a consistent empty state so every exit path is well-defined */
  Array->Length = 0;
  Array->AppendIndex = 0;
  Array->Data = NULL;

  if (InitialSize <= 0)
    return(SUCCESS);

  Array->Data = (void **)calloc(InitialSize, sizeof(Array->Data[0]));

  if (Array->Data == NULL)
    return(FAILURE);

  /* only advertise the capacity once the storage really exists */
  Array->Length = InitialSize;

  return(SUCCESS);
  } /*END DArrayInit */



/**
 * Release the slot storage owned by Array and reset it to the empty state.
 * The items that were stored in the slots are NOT freed; only the slot
 * array itself is.
 * @param Array (I/O)
 * @return always SUCCESS
 */

int DArrayFree(

  darray_t *Array) /* I/O */

  {
  /* bookkeeping first, then the storage; the steps are independent */
  Array->AppendIndex = 0;
  Array->Length = 0;

  free(Array->Data);
  Array->Data = NULL;

  return(SUCCESS);
  } /*END DArrayFree */



/**
 * Append Item onto the end of Array, resizing it if necessary.
 *
 * When the array is full its capacity is doubled (with a minimum of 10
 * slots) and the existing slots are copied into the new storage.
 *
 * @param Array (I/O)
 * @param Item (I) pointer stored as-is; it is not copied
 * @return SUCCESS, or FAILURE if the array could not grow.  On FAILURE the
 *         old storage is released and the array is reset to empty
 *         (Data NULL, Length 0, AppendIndex 0).
 */

int DArrayAppend(

  darray_t *Array, /* I/O */
  void     *Item)  /* I */

  {
  void *tmp = NULL;

  if (Array->AppendIndex >= Array->Length)
    {
    int newLength = Array->Length * 2;

    if (newLength <= 10)
      newLength = 10;

    tmp = calloc(newLength, sizeof(Array->Data[0]));

    if (tmp == NULL)
      {
      free(Array->Data);

      /* clear the pointer as well, otherwise a later DArrayFree() would
       * free the same storage a second time */
      Array->Data = NULL;
      Array->Length = 0;
      Array->AppendIndex = 0;
      return(FAILURE);
      }

    /* an empty array has no storage yet; never hand memcpy a NULL source */
    if (Array->Data != NULL)
      memcpy(tmp, Array->Data, sizeof(Array->Data[0]) * Array->Length);

    free(Array->Data);
    Array->Data = tmp;
    Array->Length = newLength;
    }

  Array->Data[Array->AppendIndex++] = Item;
  return(SUCCESS);
  } /* END DArrayAppend */



/**
 * Sort two job structs by their priority in ascending order
 * @param A (I)
 * @param B (I)
 */

int SortPrioAscend(

  const void *A, /* I */
  const void *B) /* I */

  {
  job *pjob1 = *((job **)A);
  job *pjob2 = *((job **)B);
  int prio1 = pjob1->ji_wattr[JOB_ATR_qrank].at_val.at_long;
  int prio2 = pjob2->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  return(prio1 - prio2);
  } /*END SortPrioAscend */


/*
 * update_default_np - apply the server's SRV_ATR_NPDefault value to
 * every node.
 *
 * When the attribute is greater than zero, each node's nd_nsn is set to
 * it and nd_nsnfree is recomputed so that the difference between the two
 * (nd_nsn - nd_nsnfree) is the same as before.  Every node obtained from
 * next_host() is released again with unlock_node().
 */

void  update_default_np()
  
  {
  long            np_default = 0;
  int             node_iter = -1;
  struct pbsnode *node;

  get_svr_attr_l(SRV_ATR_NPDefault, &np_default);

  /* nothing to apply unless a positive default is configured */
  if (np_default <= 0)
    return;

  for (node = next_host(&allnodes, &node_iter, NULL);
       node != NULL;
       node = next_host(&allnodes, &node_iter, NULL))
    {
    long in_use = node->nd_nsn - node->nd_nsnfree;

    node->nd_nsn     = np_default;
    node->nd_nsnfree = np_default - in_use;

    unlock_node(node, __func__, NULL, LOGLEVEL);
    }
  } /* END update_default_np() */

/*
 * add_server_names_to_acl_hosts - put every name from the list returned by
 * pbs_get_server_list() into the server's SRV_ATR_acl_hosts attribute.
 *
 * Any ":port" suffix is cut off first.  Each name is removed (DECR) and
 * then added (INCR), so a name that is already present is not duplicated.
 * Processing stops at the first name that decode_arst_direct() rejects.
 */

void add_server_names_to_acl_hosts(void)

  {
  pbs_attribute *acl_hosts = &server.sv_attr[SRV_ATR_acl_hosts];
  pbs_attribute  entry;
  char           name[PBS_MAXSERVERNAME+1];
  char          *servers;
  char          *item;
  char          *port_sep;
  int            total;
  int            idx;

  memset(name, 0, sizeof(name));
  memset(&entry, 0, sizeof(entry));

  servers = pbs_get_server_list();
  total   = csv_length(servers);

  for (idx = 0; idx < total; idx++)
    {
    item = csv_nth(servers, idx);

    if (item == NULL)
      continue;

    snprintf(name, sizeof(name), "%s", item);

    /* Don't include any port specification */
    port_sep = strchr(name, ':');

    if (port_sep != NULL)
      *port_sep = 0;

    if (decode_arst_direct(&entry, name) != 0)
      return;

    /* drop an existing copy first so the add below cannot duplicate it */
    set_arst(acl_hosts, &entry, DECR);
    set_arst(acl_hosts, &entry, INCR);

    free_arst(&entry);
    }
  }




/*
 * make_default_hierarchy - build the hierarchy string used when no usable
 * mom hierarchy file exists: one path (<sp>) holding one level (<sl>) that
 * lists every node, comma separated.  A node whose nd_mom_rm_port differs
 * from PBS_MANAGER_SERVICE_PORT is written as "name:port".
 *
 * Side effect: every node's nd_hierarchy_level is set to 0.
 *
 * @return the newly allocated dynamic_string, or NULL if memory could not
 *         be allocated (nothing is leaked in that case).
 */

dynamic_string *make_default_hierarchy()

  {
  struct pbsnode *pnode;
  dynamic_string *default_hierarchy;
  dynamic_string *level_ds;
  int             iter = -1;
  char            buf[MAXLINE];

  if ((default_hierarchy = get_dynamic_string(-1, NULL)) == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return(NULL);
    }

  if ((level_ds = get_dynamic_string(-1, NULL)) == NULL)
    {
    /* release the first string, it would otherwise be leaked */
    free_dynamic_string(default_hierarchy);

    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return(NULL);
    }

  copy_to_end_of_dynamic_string(default_hierarchy, "<sp>");
  copy_to_end_of_dynamic_string(default_hierarchy, "<sl>");

  while ((pnode = next_host(&allnodes, &iter, NULL)) != NULL)
    {
    /* comma-separate every entry after the first */
    if (level_ds->used > 0)
      append_dynamic_string(level_ds, ",");

    append_dynamic_string(level_ds, pnode->nd_name);

    /* only spell out the port when it is not the default one */
    if (PBS_MANAGER_SERVICE_PORT != pnode->nd_mom_rm_port)
      {
      snprintf(buf, sizeof(buf), ":%d", (int)pnode->nd_mom_rm_port);
      append_dynamic_string(level_ds, buf);
      }

    pnode->nd_hierarchy_level = 0;

    unlock_node(pnode, __func__, NULL, LOGLEVEL);
    }

  copy_to_end_of_dynamic_string(default_hierarchy, level_ds->str);
  copy_to_end_of_dynamic_string(default_hierarchy, "</sl>");
  copy_to_end_of_dynamic_string(default_hierarchy, "</sp>");

  free_dynamic_string(level_ds);

  return(default_hierarchy);
  } /* END make_default_hierarchy() */





/*
 * can_resolve_hostname - report whether hostname can be resolved.
 *
 * The address cache is consulted first; on a miss getaddrinfo() is tried
 * and a successful result is inserted into the cache.  A ":port" suffix is
 * hidden while resolving and put back before returning, so the caller's
 * string is unchanged afterwards.
 *
 * @param hostname (I) host name, optionally followed by ":port"
 * @return TRUE if the name resolved, FALSE otherwise
 */

int can_resolve_hostname(

  char *hostname)

  {
  struct addrinfo *results;
  char            *port_sep = strchr(hostname, ':');
  int              resolvable;

  if (port_sep != NULL)
    *port_sep = '\0';

  if (get_cached_addrinfo(hostname) != NULL)
    {
    resolvable = TRUE;
    }
  else if (getaddrinfo(hostname, NULL, NULL, &results) != 0)
    {
    resolvable = FALSE;
    }
  else
    {
    /* remember the answer so the next lookup is served from the cache */
    insert_addr_name_info(
      hostname,
      results->ai_canonname,
      (struct sockaddr_in *)results->ai_addr);

    freeaddrinfo(results);

    resolvable = TRUE;
    }

  if (port_sep != NULL)
    *port_sep = ':';

  return(resolvable);
  } /* END can_resolve_hostname() */





/*
 * check_if_in_nodes_file - mark the node named in the mom hierarchy as
 * being part of the hierarchy, creating it first if it is unknown.
 *
 * If no node with this name exists, the name is resolved (address cache
 * first, then getaddrinfo()) and a partial node is created for it.  The
 * node's nd_in_hierarchy is set to TRUE and its nd_hierarchy_level is
 * lowered to level_index when level_index is smaller.
 *
 * If the name cannot be resolved or the node cannot be created, the
 * problem is logged and the host is skipped.  A ":port" suffix in hostname
 * is hidden during the call and restored on every exit path.
 *
 * @param hostname (I) host name, optionally followed by ":port"
 * @param level_index (I) level at which the host appears in the hierarchy
 */

void check_if_in_nodes_file(

  char *hostname,
  int   level_index)

  {
  char                log_buf[LOCAL_LOG_BUF_SIZE];
  struct pbsnode     *pnode;
  char               *colon;
  struct addrinfo    *addr_info = NULL;
  struct sockaddr_in *sai;
  unsigned long       ipaddr;

  if ((colon = strchr(hostname, ':')) != NULL)
    *colon = '\0';
  
  if ((pnode = find_nodebyname(hostname)) == NULL)
    {
    snprintf(log_buf, sizeof(log_buf), 
      "Node %s found in mom_hierarchy but not found in nodes file. Adding",
      hostname);
    log_err(-1, __func__, log_buf);

    if ((sai = get_cached_addrinfo(hostname)) == NULL)
      {
      /* addr_info is only valid when getaddrinfo() reports success */
      if ((getaddrinfo(hostname, NULL, NULL, &addr_info) != 0) ||
          (addr_info == NULL))
        {
        snprintf(log_buf, sizeof(log_buf),
          "Cannot resolve node %s found in mom_hierarchy. Not adding it",
          hostname);
        log_err(-1, __func__, log_buf);

        goto restore_name;
        }

      sai = (struct sockaddr_in *)addr_info->ai_addr;
      ipaddr = ntohl(sai->sin_addr.s_addr);

      insert_addr_name_info(hostname, addr_info->ai_canonname, sai);

      freeaddrinfo(addr_info);
      }
    else
      ipaddr = ntohl(sai->sin_addr.s_addr);

    create_partial_pbs_node(hostname, ipaddr, ATR_DFLAG_MGRD | ATR_DFLAG_MGWR);

    /* the creation may have failed; do not dereference a missing node */
    if ((pnode = find_nodebyname(hostname)) == NULL)
      {
      snprintf(log_buf, sizeof(log_buf),
        "Unable to create node %s found in mom_hierarchy",
        hostname);
      log_err(-1, __func__, log_buf);

      goto restore_name;
      }
    }
    
  pnode->nd_in_hierarchy = TRUE;

  /* keep the lowest level at which the node has been seen */
  if (pnode->nd_hierarchy_level > level_index)
    pnode->nd_hierarchy_level = level_index;

  unlock_node(pnode, __func__, NULL, LOGLEVEL);

restore_name:

  if (colon != NULL)
    *colon = ':';
  } /* END check_if_in_nodes_file() */





/*
 * handle_level - turn the comma separated host list of one hierarchy level
 * into an "<sl>host,host,...</sl>" element appended to send_format.
 *
 * Hosts that cannot be resolved are logged and left out.  Every host that
 * is kept is also passed to check_if_in_nodes_file() with level_index.
 *
 * @param level_iter (I) text of the level; consumed by the tokenizer
 * @param send_format (I/O) string that receives the <sl> element
 * @param level_index (I) index of this level within its path
 * @return PBSE_NONE, or ENOMEM if the scratch string cannot be allocated
 *         (send_format is untouched in that case)
 */

int handle_level(
    
  char           *level_iter,
  dynamic_string *send_format,
  int             level_index)

  {
  dynamic_string *resolved_hosts = get_dynamic_string(-1, NULL);
  char            err_text[LOCAL_LOG_BUF_SIZE];
  char           *separators = ",";
  char           *host;

  if (resolved_hosts == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return(ENOMEM);
    }

  copy_to_end_of_dynamic_string(send_format, "<sl>");

  /* walk the hosts one token at a time */
  for (host = threadsafe_tokenizer(&level_iter, separators);
       host != NULL;
       host = threadsafe_tokenizer(&level_iter, separators))
    {
    host = trim(host);

    if (can_resolve_hostname(host) == FALSE)
      {
      snprintf(err_text, sizeof(err_text),
        "While parsing the mom hierarchy file, cannot resolve hostname %s",
        host);
      log_err(-1, __func__, err_text);

      continue;
      }

    /* comma-separate every entry after the first */
    if (resolved_hosts->used > 0)
      append_dynamic_string(resolved_hosts, ",");

    check_if_in_nodes_file(host, level_index);

    append_dynamic_string(resolved_hosts, host);
    }

  copy_to_end_of_dynamic_string(send_format, resolved_hosts->str);
  copy_to_end_of_dynamic_string(send_format, "</sl>");

  free_dynamic_string(resolved_hosts);

  return(PBSE_NONE);
  } /* END handle_level() */




/*
 * handle_path - convert one path of the mom hierarchy file into an
 * "<sp>...</sp>" element appended to send_format.
 *
 * Each child element whose tag starts with "level" is handed to
 * handle_level(); any other element is logged as noise and ignored.  If
 * the path ends up with no level at all, the opening "<sp>" is removed
 * again.
 *
 * When handle_level() fails, parsing of this path stops.  That level was
 * not written (handle_level() fails before touching send_format) and is
 * therefore not counted; the levels written so far are still closed
 * properly.
 *
 * @param path_iter (I) text of the path element
 * @param send_format (I/O) string that receives the <sp> element
 * @return PBSE_NONE, or the error returned by handle_level()
 */

int handle_path(

  char           *path_iter,
  dynamic_string *send_format)

  {
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  char *level_parent;
  char *level_child;

  int   level_index = 0;
  int   rc = PBSE_NONE;

  copy_to_end_of_dynamic_string(send_format, "<sp>");
  
  /* iterate over each level in the path */
  while (get_parent_and_child(path_iter,&level_parent,&level_child,&path_iter) == PBSE_NONE)
    {
    if (!strncmp(level_parent,"level",strlen("level")))
      {
      /* only count levels that were really added to send_format */
      if ((rc = handle_level(level_child, send_format, level_index)) != PBSE_NONE)
        break;
  
      level_index++;
      }
    else
      {
      /* non-fatal error */
      snprintf(log_buf, sizeof(log_buf),
        "Found noise in the mom hierarchy file. Ignoring <%s>%s</%s>",
        level_parent, level_child, level_parent);
      log_err(-1, __func__, log_buf);
      }
    }
  
  if (level_index == 0)
    {
    /* empty level, delete the <sp> */
    delete_last_word_from_dynamic_string(send_format);
    }
  else
    {
    /* close path */
    copy_to_end_of_dynamic_string(send_format, "</sp>");
    }

  return(rc);
  } /* END handle_path() */




/*
 * parse_mom_hierarchy - read the mom hierarchy file from fds and build the
 * string form of the hierarchy.
 *
 * Every top-level element whose tag starts with "path" is passed to
 * handle_path(); other elements are logged as noise and ignored.  Nodes
 * that were not marked nd_in_hierarchy are then collected into one extra
 * path/level at the end, and their nd_hierarchy_level is set to 0.
 *
 * The file is read with a single read() call; content beyond
 * sizeof(buffer) - 1 bytes is not parsed.
 *
 * @param fds (I) descriptor open for reading on the hierarchy file
 * @return the newly allocated dynamic_string, or NULL on a read error, an
 *         allocation failure, or when the file held no valid path
 */

dynamic_string *parse_mom_hierarchy(
    
  int fds)

  {
  int             bytes_read;
  char            buffer[MAXLINE<<10];
  char           *current;
  char           *parent;
  char           *child;
  char            log_buf[LOCAL_LOG_BUF_SIZE];
  struct pbsnode *pnode;
  int             iter = -1;
  unsigned char   first_missing_node = TRUE;
  dynamic_string *send_format = NULL;

  /* leave room for the terminator: read() does not NUL-terminate */
  if ((bytes_read = read(fds, buffer, sizeof(buffer) - 1)) < 0)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Unable to read from %s", path_mom_hierarchy);
    log_err(errno, __func__, log_buf);

    return(NULL);
    }

  /* the parser below treats the buffer as a C string */
  buffer[bytes_read] = '\0';
  
  if ((send_format = get_dynamic_string(-1, NULL)) == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");
    return(NULL);
    }

  current = buffer;

  while (get_parent_and_child(current, &parent, &child, &current) == PBSE_NONE)
    {
    if (!strncmp(parent,"path",strlen("path")))
      {
      handle_path(child, send_format);
      }
    else
      {
      /* non-fatal error */
      snprintf(log_buf, sizeof(log_buf),
        "Found noise in the mom hierarchy file. Ignoring <%s>%s</%s>",
        parent, child, parent);
      log_err(-1, __func__, log_buf);
      }
    }

  if (send_format->used == 0)
    {
    /* if there were no valid paths, return NULL to signify an error */
    free_dynamic_string(send_format);
    send_format = NULL;
    }
  else
    {
    /* check if there are nodes that weren't in the hierarchy file that are in the nodes file */
    while ((pnode = next_host(&allnodes, &iter, NULL)) != NULL)
      {
      if (pnode->nd_in_hierarchy == FALSE)
        {
        if (first_missing_node == TRUE)
          {
          /* open the extra path/level lazily, only once it is needed */
          copy_to_end_of_dynamic_string(send_format, "<sp>");
          copy_to_end_of_dynamic_string(send_format, "<sl>");
          first_missing_node = FALSE;
          copy_to_end_of_dynamic_string(send_format, pnode->nd_name);
          }
        else
          {
          append_dynamic_string(send_format, ",");
          append_dynamic_string(send_format, pnode->nd_name);
          }

        snprintf(log_buf, sizeof(log_buf),
          "Node %s found in the nodes file but not in the mom_hierarchy file. Making it a level 1 node",
          pnode->nd_name);

        pnode->nd_hierarchy_level = 0;
        log_err( -1, __func__, log_buf);
        }

      unlock_node(pnode, __func__, NULL, LOGLEVEL);
      }

    /* close the extra path/level if one was opened */
    if (first_missing_node == FALSE)
      {
      copy_to_end_of_dynamic_string(send_format, "</sl>");
      copy_to_end_of_dynamic_string(send_format, "</sp>");
      }
    }

  return(send_format);
  } /* END parse_mom_hierarchy() */





/*
 * prepare_mom_hierarchy - produce the hierarchy string for the server.
 *
 *  - hierarchy file does not exist (ENOENT): default hierarchy
 *  - file cannot be opened for another reason: error is logged, NULL
 *  - file opened but parse_mom_hierarchy() returns NULL: default hierarchy
 *  - otherwise: the parsed hierarchy
 *
 * @return the hierarchy string, or NULL as described above
 */

dynamic_string *prepare_mom_hierarchy()

  {
  dynamic_string *result;
  char            err_text[LOCAL_LOG_BUF_SIZE];
  int             hierarchy_fd = open(path_mom_hierarchy, O_RDONLY, 0);

  if (hierarchy_fd < 0)
    {
    /* Each node is a top level node */
    if (errno == ENOENT)
      return(make_default_hierarchy());

    snprintf(err_text, sizeof(err_text),
      "Unable to open %s", path_mom_hierarchy);
    log_err(errno, __func__, err_text);

    return(NULL);
    }

  result = parse_mom_hierarchy(hierarchy_fd);

  /* if there's an error, make a default hierarchy */
  if (result == NULL)
    result = make_default_hierarchy();

  close(hierarchy_fd);

  return(result);
  } /* END prepare_mom_hierarchy() */




/*
 * get_insertion_point - pick the value to insert after for a node whose
 * own level has no entry yet.
 *
 * Starting at the level just below the node's nd_hierarchy_level and
 * moving towards level 0, the first non-zero entry of indices is returned.
 *
 * @param pnode (I) node being inserted
 * @param indices (I) per-level values; 0 means "no entry for this level"
 * @return the first non-zero entry found, or 0 if there is none
 */

int get_insertion_point(

  struct pbsnode *pnode,
  int            *indices)

  {
  int lower_level = pnode->nd_hierarchy_level - 1;

  while (lower_level >= 0)
    {
    if (indices[lower_level] != 0)
      return(indices[lower_level]);

    lower_level--;
    }

  return(0);
  } /* END get_insertion_point() */




/*
 * add_all_nodes_to_hello_container - add a copy of every node name to the
 * hellos container.
 *
 * For each hierarchy level the value returned by the first
 * add_hello_after() call for that level is remembered; later nodes of the
 * same level are added after that remembered value.  The first node of a
 * level is added after the position chosen by get_insertion_point().
 * A node whose name cannot be duplicated is skipped.
 *
 * NOTE(review): nd_hierarchy_level is used as an array index without a
 * check against MAX_LEVEL_DEPTH - confirm the callers guarantee the range.
 */

void add_all_nodes_to_hello_container()

  {
  int             slot_for_level[MAX_LEVEL_DEPTH];
  int             node_iter = -1;
  struct pbsnode *node;
  char           *name_copy;

  memset(slot_for_level, 0, sizeof(slot_for_level));

  for (node = next_host(&allnodes, &node_iter, NULL);
       node != NULL;
       node = next_host(&allnodes, &node_iter, NULL))
    {
    name_copy = strdup(node->nd_name);

    if (name_copy != NULL)
      {
      int *level_slot = &slot_for_level[node->nd_hierarchy_level];

      /* make sure to insert things in order */
      if (*level_slot != 0)
        {
        add_hello_after(&hellos, name_copy, *level_slot);
        }
      else
        {
        *level_slot = add_hello_after(
                        &hellos,
                        name_copy,
                        get_insertion_point(node, slot_for_level));
        }
      }

    unlock_node(node, __func__, NULL, LOGLEVEL);
    }
  } /* END add_all_nodes_to_hello_container() */




/*
 * get_default_threads - choose the default for min_threads.
 *
 * Counts the lines of /proc/cpuinfo whose first word is "processor".  If
 * at least one is found the default is (2 * count) + 1; otherwise (or if
 * the file cannot be opened) it is DEFAULT_MIN_THREADS.  The chosen value
 * is written to the log.
 *
 * @return the default number of threads
 */

int get_default_threads()

  {
  int   default_threads = DEFAULT_MIN_THREADS;
  int   count = 0;
  int   matched;
  char  label[128];
  char  log_buf[LOCAL_LOG_BUF_SIZE];
  FILE *fp;

  if ((fp = fopen("/proc/cpuinfo", "r")) != NULL)
    {
    /* if we can determine the number of cores, make 
     * the default number of threads 2 * cores + 1.
     * The field width keeps a long word from overflowing label, and
     * stopping on EOF means label is only compared after fscanf stored
     * a fresh word in it. */
    while ((matched = fscanf(fp, "%127s %*[^\n]%*c", label)) != EOF)
      {
      if (matched == 0)
        {
        getc(fp);  /* must do something to get to eof */
        }
      else if (strcmp("processor", label) == 0)
        count++;
      }

    /* the stream is no longer needed; do not leak it */
    fclose(fp);

    if (count > 0)
      default_threads = (2 * count) + 1;
    }

  snprintf(log_buf, sizeof(log_buf),
    "Defaulting min_threads to %d threads", default_threads);
  log_event(PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, log_buf);

  return(default_threads);
  } /* END get_default_threads() */



/*
 * setup_limits - lift the resource limits of the server process.
 *
 * Does nothing when built with DEBUG.  On _CRAY the limit() calls below
 * are issued; everywhere else the CPU, file size, data and stack limits
 * (plus RSS and VMEM where those are defined) are set to RLIM_INFINITY.
 * The results of the individual calls are not checked.
 *
 * @return always PBSE_NONE
 */

int setup_limits()

  {
#if !defined(DEBUG) && defined(_CRAY)
  limit(C_JOB,      0, L_CPROC, 0);
  limit(C_JOB,      0, L_CPU,   0);
  limit(C_JOBPROCS, 0, L_CPU,   0);
  limit(C_PROC,     0, L_FD,    255);
  limit(C_JOB,      0, L_FSBLK, 0);
  limit(C_JOBPROCS, 0, L_FSBLK, 0);
  limit(C_JOB,      0, L_MEM  , 0);
  limit(C_JOBPROCS, 0, L_MEM  , 0);
#elif !defined(DEBUG)
  /* resources to lift, in the order they are applied */
  const int     resources[] =
    {
    RLIMIT_CPU,
    RLIMIT_FSIZE,
    RLIMIT_DATA,
    RLIMIT_STACK,
#ifdef RLIMIT_RSS
    RLIMIT_RSS,
#endif /* RLIMIT_RSS */
#ifdef RLIMIT_VMEM
    RLIMIT_VMEM,
#endif /* RLIMIT_VMEM */
    };
  struct rlimit unlimited;
  size_t        idx;

  unlimited.rlim_cur = RLIM_INFINITY;
  unlimited.rlim_max = RLIM_INFINITY;

  for (idx = 0; idx < sizeof(resources) / sizeof(resources[0]); idx++)
    setrlimit(resources[idx], &unlimited);
#endif /* not DEBUG */
  
  return(PBSE_NONE);
  } /* END setup_limits() */




int setup_signal_handling()

  {
  struct sigaction  act;
  struct sigaction  oact;

  sigemptyset(&act.sa_mask);

  act.sa_flags   = 0;
  act.sa_handler = change_logs;

  if (sigaction(SIGHUP, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigaction for HUP");

    return(2);
    }

  act.sa_handler = stop_me;

  if (sigaction(SIGINT, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigaction for INT");

    return(2);
    }

  if (sigaction(SIGTERM, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigactin for TERM");

    return(2);
    }

#ifdef NDEBUG

  if (sigaction(SIGQUIT, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigactin for QUIT");

    return(2);
    }

#endif /* NDEBUG */

#ifdef SIGSHUTDN

  if (sigaction(SIGSHUTDN, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigactin for SHUTDN");

    return(2);
    }

#endif /* SIGSHUTDN */

  /*
   * Catch these signals to ensure we core dump even if
   * our rlimit for core dumps is set to 0 initially.
   *
   * Chris Samuel - VPAC
   * csamuel@vpac.org - 29th July 2003
   *
   * Now conditional on PBSCOREDUMP environment variable.
   * 13th August 2003.
   */

  if (getenv("PBSCOREDUMP"))
    {
    act.sa_handler = catch_abort;   /* make sure we core dump */

    sigaction(SIGSEGV, &act, NULL);
    sigaction(SIGBUS,  &act, NULL);
    sigaction(SIGFPE,  &act, NULL);
    sigaction(SIGILL,  &act, NULL);
    sigaction(SIGTRAP, &act, NULL);
    sigaction(SIGSYS,  &act, NULL);
    }

  act.sa_handler = SIG_DFL;

  if (sigaction(SIGCHLD, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigaction for CHLD");

    return(2);
    }

  act.sa_handler = SIG_IGN;

  if (sigaction(SIGPIPE, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigaction for PIPE");

    return(2);
    }

  act.sa_handler = change_log_level;

  if (sigaction(SIGUSR1, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigaction for USR1");

    return(2);
    }

  if (sigaction(SIGUSR2, &act, &oact) != 0)
    {
    log_err(errno, __func__, "sigaction for USR2");

    return(2);
    }

  return(PBSE_NONE);
  } /* END setup_signal_handling() */




/*
 * initialize_paths - build the server's file and directory path names and
 * validate the directories the server depends on.
 *
 * path_priv and path_svrdb are only built when they are still NULL; all
 * other path_* values are (re)built here.  The checkpoint directory must
 * exist and be a directory.  When SERVER_CHKPTDIR is defined, its mode is
 * changed to 01777 if any of those bits is missing.  Resource definitions
 * are initialized if svr_resc_def is still NULL.  Unless built with DEBUG
 * or NO_SECURITY_CHECK, chk_file_sec() is run on the main directories.
 *
 * @return PBSE_NONE on success, -1 if the checkpoint directory is unusable
 *         or the resource definitions cannot be initialized, 3 if one of
 *         the chk_file_sec() checks fails
 */

int initialize_paths()

  {
  int          rc = PBSE_NONE;
  char        *suffix_slash = "/";
  char        *new_tag = ".new";
  struct stat  statbuf;
  char         log_buf[LOCAL_LOG_BUF_SIZE];
#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
  char         EMsg[1024];
#endif /* not DEBUG and not NO_SECURITY_CHECK */

  if (path_priv == NULL)
    path_priv        = build_path(path_home, PBS_SVR_PRIVATE, suffix_slash);

  path_arrays        = build_path(path_priv, PBS_ARRAYDIR, suffix_slash);
  path_spool         = build_path(path_home, PBS_SPOOLDIR, suffix_slash);
  path_queues        = build_path(path_priv, PBS_QUEDIR,   suffix_slash);
  path_jobs          = build_path(path_priv, PBS_JOBDIR,   suffix_slash);
  path_credentials   = build_path(path_priv, PBS_CREDENTIALDIR, suffix_slash);
  path_acct          = build_path(path_priv, PBS_ACCT,     suffix_slash);

  if (path_svrdb == NULL)
    path_svrdb       = build_path(path_priv, PBS_SERVERDB, NULL);

  path_svrdb_new     = build_path(path_priv, PBS_SERVERDB, new_tag);
  path_svrlog        = build_path(path_home, PBS_LOGFILES, suffix_slash);
  path_jobinfo_log   = build_path(path_home, PBS_JOBINFOLOGDIR, suffix_slash);
  path_track         = build_path(path_priv, PBS_TRACKING, NULL);
  path_nodes         = build_path(path_priv, NODE_DESCRIP, NULL);
  path_nodes_new     = build_path(path_priv, NODE_DESCRIP, new_tag);
  path_nodestate     = build_path(path_priv, NODE_STATUS,  NULL);
  path_nodenote      = build_path(path_priv, NODE_NOTE,    NULL);
  path_nodenote_new  = build_path(path_priv, NODE_NOTE, new_tag);
  path_mom_hierarchy = build_path(path_priv, PBS_MOM_HIERARCHY, NULL);

#ifdef SERVER_CHKPTDIR
  /* need to make sure path ends with a '/' */
  if (*(SERVER_CHKPTDIR + strlen(SERVER_CHKPTDIR) - 1)  == '/')
    {
    path_checkpoint  = strdup(SERVER_CHKPTDIR);
    }
  else
    {
    int len = strlen(SERVER_CHKPTDIR) + strlen(suffix_slash) + 1;

    path_checkpoint = calloc(1, len);

    if (path_checkpoint != NULL)
      snprintf(path_checkpoint, len, "%s%s", SERVER_CHKPTDIR, suffix_slash);
    }

#else
  path_checkpoint    = build_path(path_home, PBS_CHKPTDIR, suffix_slash);
#endif

  /* without a checkpoint path the checks below cannot be made */
  if (path_checkpoint == NULL)
    {
    log_err(ENOMEM, "pbs_init", "Cannot allocate memory");

    return(-1);
    }

  /* check existence and make sure it is a directory */

  if (stat(path_checkpoint, &statbuf) < 0)
    {
    /* keep the stat() error; the formatting calls may overwrite errno */
    int stat_errno = errno;

    snprintf(log_buf, sizeof(log_buf),
      "unable to stat checkpoint directory %s, errno %d (%s)",
      path_checkpoint,
      stat_errno,
      strerror(stat_errno));
    log_err(stat_errno, "pbs_init", log_buf);

    return(-1);
    }

  if (!S_ISDIR(statbuf.st_mode))
    {
    snprintf(log_buf, sizeof(log_buf),
      "checkpoint directory path %s is not a directory", path_checkpoint);

    /* stat() succeeded, so errno holds nothing meaningful here */
    log_err(ENOTDIR, "pbs_init", log_buf);

    return(-1);
    }

#ifdef SERVER_CHKPTDIR
  /* set permissions on checkpoint path, if needed; the mode bits must be
   * masked with a bitwise AND */

  if ((statbuf.st_mode & 01777) != 01777)
    {
    if (chmod(path_checkpoint, 01777) != 0)
      {
      int chmod_errno = errno;

      snprintf(log_buf, sizeof(log_buf),
        "unable to set permissions on checkpoint directory %s",
        path_checkpoint);
      log_err(chmod_errno, "pbs_init", log_buf);
      }
    }
#endif

  if (svr_resc_def == NULL)
    {
    if ((rc = init_resc_defs()) != PBSE_NONE)
      {
      log_err(rc, __func__, msg_init_baddb);
      
      return(-1);
      }
    }

#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)

  rc  = chk_file_sec(path_jobs,  1, 0, S_IWGRP | S_IWOTH, 1, EMsg);
  rc |= chk_file_sec(path_queues, 1, 0, S_IWGRP | S_IWOTH, 0, EMsg);
  rc |= chk_file_sec(path_spool, 1, 1, S_IWOTH,        0, EMsg);
  rc |= chk_file_sec(path_acct,  1, 0, S_IWGRP | S_IWOTH, 0, EMsg);
  rc |= chk_file_sec(path_credentials,  1, 0, S_IWGRP | S_IWOTH, 0, EMsg);
  rc |= chk_file_sec(PBS_ENVIRON, 0, 0, S_IWGRP | S_IWOTH, 1, EMsg);

  if (rc != PBSE_NONE)
    {
    return(3);
    }
#endif /* not DEBUG and not NO_SECURITY_CHECK */

  return(rc);
  } /* END initialize_paths() */



/*
 * pbsd_init_new_mutex - allocate one mutex and initialize it with the
 * default attributes.
 *
 * @return the new mutex, or NULL if it cannot be allocated
 */

static pthread_mutex_t *pbsd_init_new_mutex(void)

  {
  pthread_mutex_t *new_mutex = calloc(1, sizeof(pthread_mutex_t));

  if (new_mutex != NULL)
    pthread_mutex_init(new_mutex, NULL);

  return(new_mutex);
  }

/*
 * initialize_data_structures_and_mutexes - create the server-wide mutexes
 * and initialize the global containers (recyclers, task lists, job lists,
 * hello containers, arrays, queues).  The login holder and the alps
 * reservations are only initialized when SRV_ATR_CrayEnabled is TRUE.
 *
 * @return PBSE_NONE, or ENOMEM if one of the mutexes cannot be allocated
 *         (initialization stops at that point)
 */

int initialize_data_structures_and_mutexes()

  {
  long cray_enabled = FALSE;

  /* || short-circuits, so nothing further is created after a failure */
  if (((svr_do_schedule_mutex      = pbsd_init_new_mutex()) == NULL) ||
      ((check_tasks_mutex          = pbsd_init_new_mutex()) == NULL) ||
      ((listener_command_mutex     = pbsd_init_new_mutex()) == NULL) ||
      ((node_state_mutex           = pbsd_init_new_mutex()) == NULL) ||
      ((scheduler_sock_jobct_mutex = pbsd_init_new_mutex()) == NULL))
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");

    return(ENOMEM);
    }

  /* no scheduler connection and no scheduled jobs yet */
  pthread_mutex_lock(scheduler_sock_jobct_mutex);
  scheduler_sock = -1;
  scheduler_jobct = 0;
  pthread_mutex_unlock(scheduler_sock_jobct_mutex);

  initialize_recycler();
  initialize_batch_request_holder();

  initialize_all_tasks_array(&task_list_timed);
  initialize_all_tasks_array(&task_list_event);

  initialize_all_jobs_array(&alljobs);
  initialize_all_jobs_array(&array_summary);
  initialize_all_jobs_array(&newjobs);
  initialize_hello_container(&hellos);
  initialize_hello_container(&failures);
  initialize_task_recycler();
  initialize_queue_recycler();

  CLEAR_HEAD(svr_newnodes);

  initialize_all_arrays_array();

  initialize_allques_array(&svr_queues);

  get_svr_attr_l(SRV_ATR_CrayEnabled, &cray_enabled);
  if (cray_enabled == TRUE)
    {
    initialize_login_holder();
    initialize_alps_reservations();
    }
  
  if ((acctfile_mutex = pbsd_init_new_mutex()) == NULL)
    {
    log_err(ENOMEM, __func__, "Cannot allocate memory");

    return(ENOMEM);
    }

  return(PBSE_NONE);
  } /* END initialize_data_structures_and_mutexes() */



/*
 * setup_server_attrs - load the built-in defaults into the server
 * attribute array, then either recover the saved server database or, for
 * a "create" start, wipe the private directory and save a fresh one.
 * Finally opens the accounting file and, if record_job_info is set, the
 * job log.
 *
 * type - initialization type (RECOV_CREATE is treated specially)
 *
 * server.sv_attr_mutex is held for the duration of the call, except around
 * svr_save(), and is released on every return path.
 *
 * Returns PBSE_NONE on success, -1 if the database cannot be recovered or
 * the accounting / job log file cannot be opened.
 */

int setup_server_attrs(
    
  int type)

  {
  int i;
  int rc = PBSE_NONE;

  pthread_mutex_lock(server.sv_attr_mutex);

  for (i = 0; i < SRV_ATR_LAST; i++)
    clear_attr(&server.sv_attr[i], &svr_attr_def[i]);

  server.sv_attr[SRV_ATR_scheduler_iteration].at_val.at_long =  PBS_SCHEDULE_CYCLE;
  server.sv_attr[SRV_ATR_scheduler_iteration].at_flags = ATR_VFLAG_SET;

  server.sv_attr[SRV_ATR_State].at_val.at_long = SV_STATE_INIT;
  server.sv_attr[SRV_ATR_State].at_flags = ATR_VFLAG_SET;

  svr_attr_def[SRV_ATR_mailfrom].at_decode(
    &server.sv_attr[SRV_ATR_mailfrom],
    0,
    0,
    PBS_DEFAULT_MAIL,
    0);

  server.sv_attr[SRV_ATR_tcp_timeout].at_val.at_long = PBS_TCPTIMEOUT;
  server.sv_attr[SRV_ATR_tcp_timeout].at_flags = ATR_VFLAG_SET;

  server.sv_attr[SRV_ATR_check_rate].at_val.at_long = PBS_NORMAL_PING_RATE / 2;
  server.sv_attr[SRV_ATR_check_rate].at_flags = ATR_VFLAG_SET;

  server.sv_attr[SRV_ATR_JobStatRate].at_val.at_long = PBS_RESTAT_JOB;
  server.sv_attr[SRV_ATR_JobStatRate].at_flags = ATR_VFLAG_SET;

  server.sv_attr[SRV_ATR_PollJobs].at_val.at_long = PBS_POLLJOBS;
  server.sv_attr[SRV_ATR_PollJobs].at_flags = ATR_VFLAG_SET;

  server.sv_attr[SRV_ATR_MomJobSync].at_flags = ATR_VFLAG_SET;
  server.sv_attr[SRV_ATR_MomJobSync].at_val.at_long = 1;

  server.sv_attr[SRV_ATR_MoabArrayCompatible].at_val.at_long = TRUE;
  server.sv_attr[SRV_ATR_MoabArrayCompatible].at_flags = ATR_VFLAG_SET;

  /* force logging of all types */
  server.sv_attr[SRV_ATR_log_events].at_val.at_long = PBSEVENT_MASK;
  server.sv_attr[SRV_ATR_log_events].at_flags = ATR_VFLAG_SET;

  /* does a usable server database file exist? */
  rc = chk_save_file(path_svrdb);

  if (type != RECOV_CREATE)
    {
    /* If not a "create" initialization, recover server db:
     * open the server database (save file) and read it in */
    if ((rc != PBSE_NONE) || ((rc = svr_recov_xml(path_svrdb, FALSE)) == -1)) 
      {
      log_err(rc, __func__, msg_init_baddb);

      /* this path used to return with the attribute mutex still held */
      pthread_mutex_unlock(server.sv_attr_mutex);

      return(-1);
      }

    /* the recovered resources_assigned value is discarded here */
    if (server.sv_attr[SRV_ATR_resource_assn].at_flags & ATR_VFLAG_SET)
      {
      svr_attr_def[SRV_ATR_resource_assn].at_free(
        &server.sv_attr[SRV_ATR_resource_assn]);
      }
    }
  else
    {
    if (rc == PBSE_NONE)
      {
      /* path_svrdb exists */
      rm_files(path_priv);

      /* svr_save() is called with the attribute mutex released */
      pthread_mutex_unlock(server.sv_attr_mutex);
      svr_save(&server, SVR_SAVE_FULL);
      pthread_mutex_lock(server.sv_attr_mutex);
      }
    }

  rc = PBSE_NONE;

  svr_attr_def[SRV_ATR_version].at_decode(
    &server.sv_attr[SRV_ATR_version],
    0,
    0,
    PACKAGE_VERSION,
    0);

  /* open accounting file and job log file if logging is set */
  if (acct_open(acct_file) != 0)
    {
    pthread_mutex_unlock(server.sv_attr_mutex);
    return(-1);
    }

  if (server.sv_attr[SRV_ATR_RecordJobInfo].at_val.at_long)
    {
    rc = job_log_open(job_log_file, path_jobinfo_log);

    if (rc != PBSE_NONE)
      {
      fprintf(stderr, "Could not open job_logs \n");
      pthread_mutex_unlock(server.sv_attr_mutex);
      return(-1);
      }
    }

  /* set up other server and global variables */
  if (a_opt_init != -1)
    {
    /* a_option was set, overrides saved value of scheduling attr */
    server.sv_attr[SRV_ATR_scheduling].at_val.at_long = a_opt_init;
    server.sv_attr[SRV_ATR_scheduling].at_flags |= ATR_VFLAG_SET;
    }
      
  pthread_mutex_unlock(server.sv_attr_mutex);

  return(rc);
  } /* END setup_server_attrs() */



/*
 * initialize_nodes - prepare the global node container and load the nodes.
 *
 * Returns -1 when setup_nodes() reports -1; otherwise adds the server
 * names to acl_hosts, refreshes the default np value and returns
 * PBSE_NONE.
 */

int initialize_nodes()

  {
  initialize_all_nodes_array(&allnodes);

  if (setup_nodes() != -1)
    {
    /* nodes loaded - finish the node-dependent setup */
    add_server_names_to_acl_hosts();
    update_default_np();

    return(PBSE_NONE);
    }

  return(-1);
  } /* END initialize_nodes() */




/*
 * handle_queue_recovery - recover every saved queue found in path_queues.
 *
 * type - initialization type; only used to pick the severity of the
 *        final "expected vs. recovered" log entry
 *
 * The caller holds server.sv_qs_mutex.  On success it is still held on
 * return; on a failure return (-1) it has been released, matching the
 * error paths of the array and job recovery routines.
 *
 * Returns PBSE_NONE on success, -1 if the queue directory cannot be
 * entered or opened.
 */

int handle_queue_recovery(
    
  int type)

  {
  int               rc = PBSE_NONE;
  struct dirent    *pdirent;
  DIR              *dir;
  int               had;
  pbs_queue        *pque = NULL;
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  int               logtype;

  if (chdir(path_queues) != 0)
    {
    snprintf(log_buf, sizeof(log_buf), msg_init_chdir, path_queues);

    log_err(errno, __func__, log_buf);

    /* release the mutex like every other failure path does */
    snprintf(log_buf, sizeof(log_buf), "%s:2", __func__);
    unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    return(-1);
    }

  /* remember how many queues the server database claimed; the counter is
   * rebuilt as queues are recovered */
  had = server.sv_qs.sv_numque;
  server.sv_qs.sv_numque = 0;

  dir = opendir(".");

  if (dir == NULL)
    {
    log_err(-1, __func__, msg_init_noqueues);

    snprintf(log_buf, sizeof(log_buf), "%s:1", __func__);
    unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    return(-1);
    }

  while ((pdirent = readdir(dir)) != NULL)
    {
    /* only regular, non-dot files are candidate queue save files */
    if (chk_save_file(pdirent->d_name) != 0)
      continue;

    /* recover queue */
    if ((pque = que_recov_xml(pdirent->d_name)) == NULL)
      continue;

    /* que_recov increments sv_numque */
    snprintf(log_buf, sizeof(log_buf), msg_init_recovque, pque->qu_qs.qu_name);

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_SERVER,
      msg_daemonname,
      log_buf);

    /* the saved resources_assigned value is discarded here */
    if (pque->qu_attr[QE_ATR_ResourceAssn].at_flags & ATR_VFLAG_SET)
      {
      que_attr_def[QE_ATR_ResourceAssn].at_free(&pque->qu_attr[QE_ATR_ResourceAssn]);
      }

    unlock_queue(pque, __func__, NULL, LOGLEVEL);
    }

  closedir(dir);

  /* a count mismatch is an error unless the server is being created */
  if ((had != server.sv_qs.sv_numque) && (type != RECOV_CREATE))
    logtype = PBSEVENT_ERROR | PBSEVENT_SYSTEM;
  else
    logtype = PBSEVENT_SYSTEM;

  snprintf(log_buf, sizeof(log_buf), msg_init_expctq, had, server.sv_qs.sv_numque);

  log_event(logtype, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf);

  return(rc);
  } /* END handle_queue_recovery() */




/*
 * handle_array_recovery - walk path_arrays and recover each saved job
 * array, or remove the files on a create/cold start.
 *
 * type - initialization type; RECOV_CREATE and RECOV_COLD unlink every
 *        regular file found instead of recovering it
 *
 * The caller holds server.sv_qs_mutex; it is released on every failure
 * return and left held on success.
 *
 * Returns PBSE_NONE on success, -1 if the directory cannot be entered or
 * opened, or the array_recov() error code of the first array file that
 * fails to load.
 */

int handle_array_recovery(
    
  int type)

  {
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  struct dirent    *pdirent;
  DIR              *dir;
  int               rc = PBSE_NONE;
  int               saved_errno;
  job_array        *pa = NULL;
  size_t            name_len;
  size_t            array_suf_len = strlen(ARRAY_FILE_SUFFIX);

  if (chdir(path_arrays) != 0)
    {
    snprintf(log_buf, sizeof(log_buf), msg_init_chdir, path_arrays);

    log_err(errno, __func__, log_buf);

    snprintf(log_buf, sizeof(log_buf), "%s:2", __func__);
    unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    return(-1);
    }

  dir = opendir(".");

  if (dir == NULL)
    {
    /* readdir(NULL) is not allowed - fail like the chdir case above */
    saved_errno = errno;

    snprintf(log_buf, sizeof(log_buf), "unable to open array directory %s", path_arrays);

    log_err(saved_errno, __func__, log_buf);

    snprintf(log_buf, sizeof(log_buf), "%s:1", __func__);
    unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    return(-1);
    }

  while ((pdirent = readdir(dir)) != NULL)
    {
    if (chk_save_file(pdirent->d_name) != PBSE_NONE)
      continue;

    /* if create or clean recovery, the saved arrays are discarded */
    if ((type == RECOV_CREATE) || 
        (type == RECOV_COLD))
      {
      unlink(pdirent->d_name);

      continue;
      }

    /* skip files without the proper suffix; names shorter than the
     * suffix cannot match and must not be indexed before their start */
    name_len = strlen(pdirent->d_name);

    if ((name_len < array_suf_len) ||
        (strcmp(pdirent->d_name + (name_len - array_suf_len), ARRAY_FILE_SUFFIX) != 0))
      continue;

    if ((rc = array_recov(pdirent->d_name, &pa)) != PBSE_NONE)
      {
      snprintf(log_buf, sizeof(log_buf),
        "could not recover array-struct from file %s--skipping. job array can not be recovered.",
        pdirent->d_name);

      log_err(errno, __func__, log_buf);

      snprintf(log_buf, sizeof(log_buf), "%s:3", __func__);
      unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

      /* do not leak the directory stream on the early return */
      closedir(dir);

      return(rc);
      }

    /* the per-array recovered-job counter starts at zero; nothing in
     * this function increments it */
    pa->jobs_recovered = 0;

    pthread_mutex_unlock(pa->ai_mutex);

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, log_buf);
      }
    }

  closedir(dir);

  return(rc);
  } /* handle_array_recovery() */




/*
 * handle_job_recovery - recover every saved job found in path_jobs and
 * hand each one to pbsd_init_job().
 *
 * type - initialization type (RECOV_COLD / RECOV_CREATE change how jobs
 *        and count mismatches are treated)
 *
 * The caller holds server.sv_qs_mutex on entry.  It is released early in
 * this function (or on the chdir failure path) and is NOT held on return.
 *
 * Files ending in ".TA" are recovered as array templates, files ending in
 * JOB_FILE_SUFFIX as ordinary jobs; a job file that cannot be recovered is
 * renamed with JOB_BAD_SUFFIX.  Recovered jobs are sorted with
 * SortPrioAscend before being initialized.
 *
 * Returns -1 if the job directory cannot be entered, PBSE_NONE otherwise.
 */

int handle_job_recovery(

  int type)

  {
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  char              lock_id[64];
  struct dirent    *pdirent;
  DIR              *dir;
  int               had;
  int               rc = PBSE_NONE;
  job              *pjob;
  int               logtype;
  size_t            name_len;
  char             *psuffix;
  int               job_count = 0; /* Count of directory entries read */
  char             *job_suffix = JOB_FILE_SUFFIX;
  size_t            job_suf_len = strlen(job_suffix);
  char              basen[MAXPATHLEN+1];
  int               Index;
  int               iter = -1;

  if (chdir(path_jobs) != 0)
    {
    snprintf(log_buf, sizeof(log_buf), msg_init_chdir, path_jobs);

    log_err(errno, __func__, log_buf);

    snprintf(log_buf, sizeof(log_buf), "%s:1", __func__);
    unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    return(-1);
    }

  /* remember the job count from the server database; the counter is
   * rebuilt while the jobs are recovered */
  had = server.sv_qs.sv_numjobs;

  server.sv_qs.sv_numjobs = 0;
  snprintf(log_buf, sizeof(log_buf), "%s:2", __func__);
  unlock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

  dir = opendir(".");

  if (dir == NULL)
    {
    if ((type != RECOV_CREATE) && (type != RECOV_COLD))
      {
      if (had == 0)
        {
        log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, msg_init_nojobs);
        }
      else
        {
        snprintf(log_buf, sizeof(log_buf), msg_init_exptjobs, had, 0);

        log_err(-1, __func__, log_buf);
        }
      }
    }
  else
    {
    darray_t Array;
    DArrayInit(&Array,100);
    /* Now, for each job found ... */

    while ((pdirent = readdir(dir)) != NULL)
      {
      job_count++;
      if ((job_count % 1000) == 0)
        {
        snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "%d files read from disk", job_count);
        log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf);
        }

      if (chk_save_file(pdirent->d_name) != 0)
        continue;

      /* a name shorter than the suffix cannot be a job file; skipping it
       * also keeps psuffix from pointing before the start of d_name */
      name_len = strlen(pdirent->d_name);

      if (name_len < job_suf_len)
        continue;

      /* recover the jobs */
      psuffix = pdirent->d_name + (name_len - job_suf_len);

      if (!strcmp(psuffix, ".TA"))
        {
        /* array template job */
        if ((pjob = job_recov(pdirent->d_name)) != NULL)
          {
          pjob->ji_is_array_template = TRUE;

          if (DArrayAppend(&Array,pjob) == FAILURE)
            {
            log_err(ENOMEM,"main","out of memory reloading jobs");
            exit(-1);
            }

          if (type == RECOV_COLD)
            pjob->ji_cold_restart = TRUE;
          
          pthread_mutex_unlock(pjob->ji_mutex);
          }

        continue;
        }

      if (strcmp(psuffix, job_suffix))
        continue;

      if ((pjob = job_recov(pdirent->d_name)) != NULL)
        {
        if (DArrayAppend(&Array,pjob) == FAILURE)
          {
          log_err(ENOMEM, "main", "out of memory reloading jobs");
          exit(-1);
          }

        if (type == RECOV_COLD)
          pjob->ji_cold_restart = TRUE;

        pthread_mutex_unlock(pjob->ji_mutex);
        }
      else
        {
        snprintf(log_buf, sizeof(log_buf), msg_init_badjob, pdirent->d_name);

        log_err(-1, __func__, log_buf);

        /* remove corrupt job */
        snprintf(basen, sizeof(basen), "%s%s", pdirent->d_name, JOB_BAD_SUFFIX);

        if (link(pdirent->d_name, basen) < 0)
          {
          log_err(errno, __func__, "failed to link corrupt .JB file to .BD");
          }
        else
          {
          unlink(pdirent->d_name);
          }
        }
      }    /* END while ((pdirent = readdir(dir)) != NULL) */

    snprintf(log_buf, LOCAL_LOG_BUF_SIZE, "%d total files read from disk", job_count);
    log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, msg_daemonname, log_buf);
    closedir(dir);
    qsort(Array.Data, Array.AppendIndex, sizeof(Array.Data[0]), SortPrioAscend);

    for (Index = 0; Index < Array.AppendIndex; Index++)
      {
      job *recovered = (job *)Array.Data[Index];

      pthread_mutex_lock(recovered->ji_mutex);

      if (pbsd_init_job(recovered, type) == FAILURE)
        {
        /* NOTE(review): pbsd_init_job() returns FAILURE after calling
         * init_abt_job()/job_abt(); if those release the job, the two
         * statements below touch freed memory - confirm */
        log_event(
          PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE,
          PBS_EVENTCLASS_JOB,
          recovered->ji_qs.ji_jobid,
          msg_script_open);

        pthread_mutex_unlock(recovered->ji_mutex);

        continue;
        }

      /* on a normal restart a non-array job flagged as having a script
       * must still have its script file on disk */
      if ((type != RECOV_COLD) &&
          (type != RECOV_CREATE) &&
          (!(recovered->ji_wattr[JOB_ATR_job_array_request].at_flags & ATR_VFLAG_SET)) &&
          (recovered->ji_qs.ji_svrflags & JOB_SVFLG_SCRIPT))
        {
        snprintf(basen, sizeof(basen), "%s%s", recovered->ji_qs.ji_fileprefix, JOB_SCRIPT_SUFFIX);

        if (chk_save_file(basen) != 0)
          {
          log_event(
            PBSEVENT_ERROR | PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_JOB | PBSEVENT_FORCE,
            PBS_EVENTCLASS_JOB,
            recovered->ji_qs.ji_jobid,
            msg_script_open);

          /* no unlock here: the job is handed to init_abt_job() locked */
          init_abt_job(recovered);
          }
        else
          {
          pthread_mutex_unlock(recovered->ji_mutex);
          }
        }
      else
        pthread_mutex_unlock(recovered->ji_mutex);
      }

    DArrayFree(&Array);
    snprintf(log_buf, sizeof(log_buf), "%s:1", __func__);
    lock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

    if ((had != server.sv_qs.sv_numjobs) &&
        (type != RECOV_CREATE) &&
        (type != RECOV_COLD))
      {
      logtype = PBSEVENT_ERROR | PBSEVENT_SYSTEM;
      }
    else
      {
      logtype = PBSEVENT_SYSTEM;
      }

    /* build the summary while the counter is protected; the mutex tag
     * gets its own buffer so it no longer overwrites the summary that is
     * about to be logged */
    snprintf(log_buf, sizeof(log_buf), msg_init_exptjobs, had, server.sv_qs.sv_numjobs);
    snprintf(lock_id, sizeof(lock_id), "%s:3", __func__);
    unlock_sv_qs_mutex(server.sv_qs_mutex, lock_id);
    log_event(logtype,PBS_EVENTCLASS_SERVER,msg_daemonname,log_buf);
    }  /* END else */

  /* If queue_rank has gone negative, renumber all jobs and reset rank */
  if (queue_rank < 0)
    {
    iter = -1;

    queue_rank = 0;

    while ((pjob = next_job(&alljobs, &iter)) != NULL)
      {
      pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long = ++queue_rank;
      
      job_save(pjob, SAVEJOB_FULL, 0);
      
      pthread_mutex_unlock(pjob->ji_mutex);
      }
    }

  return(rc);
  } /* END handle_job_recovery() */




/*
 * cleanup_recovered_arrays - after jobs have been recovered, visit every
 * job array returned by next_array() and decide whether to delete it,
 * resume cloning its sub-jobs, or leave it as is.
 *
 * Always returns PBSE_NONE (rc is never modified).
 *
 * NOTE(review): on the paths that call array_delete() the loop continues
 * without unlocking pa->ai_mutex here; presumably array_delete() disposes
 * of the mutex together with the array - confirm.
 */

int cleanup_recovered_arrays()

  {
  job_array *pa;
  job       *pjob;
  int        iter = -1;
  int        rc = PBSE_NONE;
  char       log_buf[LOCAL_LOG_BUF_SIZE];

  while ((pa = next_array(&iter)) != NULL)
    {
    int job_template_exists = FALSE;

    pthread_mutex_lock(pa->ai_mutex);

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: locked ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }
     
    /* does the job named by parent_id (the array template) still exist?
     * the job's ji_mutex is released right away, so find_job() is
     * presumably returning it locked */
    if ((pjob = find_job(pa->ai_qs.parent_id)) != NULL)
      {
      job_template_exists = TRUE;
      pthread_mutex_unlock(pjob->ji_mutex);
      }

    /* if no jobs were recovered, delete this array */
    if (pa->jobs_recovered == 0)
      {
      /* look the template up again and purge it together with the array */
      if ((pjob = find_job(pa->ai_qs.parent_id)) != NULL)
        job_purge(pjob);

      array_delete(pa);

      /* move on to the next array */
      continue;
      }

    /* see if we need to upgrade the array version. */
    /* NOTE: only an on-disk struct_version of exactly 3 is upgraded; the
     * purged count is derived from the jobs that failed to come back */
    if (pa->ai_qs.struct_version == 3)
      {
      pa->ai_qs.struct_version = ARRAY_QS_STRUCT_VERSION;
      pa->ai_qs.num_purged = pa->ai_qs.num_jobs - pa->jobs_recovered;
      array_save(pa);
      }

    /* fewer sub-jobs cloned than requested: cloning was interrupted */
    if (pa->ai_qs.num_cloned != pa->ai_qs.num_jobs)
      {
      /* if we can't finish building the job array then delete whats been done
         so far */

      if (job_template_exists == FALSE)
        {
        int        i;

        for (i = 0; i < pa->ai_qs.array_size; i++)
          {
          if (pa->job_ids[i] != NULL)
            {
            if ((pjob = find_job(pa->job_ids[i])) != NULL)
              {
              /* the array mutex is dropped around job_purge() and
               * re-taken afterwards */
              pthread_mutex_unlock(pa->ai_mutex);
              job_purge(pjob);
              pthread_mutex_lock(pa->ai_mutex);
              }
            }
          }

        array_delete(pa);
        continue;
        }
      else
        {
        /* TODO Someone must have been naughty and did a kill -9 on pbs_server,
           we might need to validate that the last job was fully initialized
           before continuing the cloning process. */
        /* the template exists: queue job_clone_wt with a private copy of
         * the parent id to continue cloning */
        enqueue_threadpool_request(job_clone_wt, strdup(pa->ai_qs.parent_id));
        }

      }
    else if ((pa->ai_qs.jobs_done == pa->ai_qs.num_jobs) && 
             (job_template_exists == FALSE))
      {
      /* every sub-job is done and the template is gone: drop the array */
      array_delete(pa);
      continue;
      }

    if (LOGLEVEL >= 7)
      {
      sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
      log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, pa->ai_qs.parent_id, log_buf);
      }
    
    pthread_mutex_unlock(pa->ai_mutex);
    } /* END for each array */

  return(rc);
  } /* END cleanup_recovered_arrays() */




/*
 * handle_job_and_array_recovery - run the three recovery stages in order:
 * arrays, then jobs, then the post-recovery array cleanup.
 *
 * type - initialization type, passed through to the first two stages
 *
 * A stage only runs if every earlier stage returned PBSE_NONE; the value
 * returned is that of the last stage that ran.
 */

int handle_job_and_array_recovery(

  int type)

  {
  int rc = handle_array_recovery(type);

  if (rc == PBSE_NONE)
    rc = handle_job_recovery(type);

  if (rc == PBSE_NONE)
    rc = cleanup_recovered_arrays();

  return(rc);
  } /* END handle_job_and_array_recovery() */




/*
 * handle_tracking_records - load the job tracking file (path_track) into
 * server.sv_track and schedule the periodic track_save work task.
 *
 * The file is created if it does not exist.  The in-memory table is sized
 * from the file length but never smaller than PBS_TRACK_MINSIZE entries.
 *
 * Returns PBSE_NONE on success, -1 if the file cannot be opened, fails the
 * security check, cannot be stat'ed, or the table cannot be allocated.
 * The descriptor is closed on every path.
 */

int handle_tracking_records()

  {
  int          fd;
  int          rc = PBSE_NONE;
  struct stat  statbuf;
#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
  char         EMsg[1024];
#endif /* not DEBUG and not NO_SECURITY_CHECK */

  if ((fd = open(path_track, O_RDONLY | O_CREAT, 0600)) < 0)
    {
    log_err(errno, __func__, "unable to open tracking file");

    return(-1);
    }

#if !defined(DEBUG) && !defined(NO_SECURITY_CHECK)
  if (chk_file_sec(path_track, 0, 0, S_IWGRP | S_IWOTH, 0, EMsg) != 0)
    {
    close(fd);

    return(-1);
    }

#endif  /* not DEBUG and not NO_SECURITY_CHECK */

  if (fstat(fd, &statbuf) < 0)
    {
    log_err(errno, "pbs_init", "unable to stat tracking file");

    close(fd);

    return(-1);
    }

  /* number of records in the file, rounded up */
  server.sv_tracksize = (statbuf.st_size + sizeof(struct tracking) - 1) / sizeof(struct tracking);

  if (server.sv_tracksize < PBS_TRACK_MINSIZE)
    server.sv_tracksize = PBS_TRACK_MINSIZE;

  /* calloc() zero-fills the table, so every tk_mtime starts out as 0 */
  if ((server.sv_track = calloc(server.sv_tracksize, sizeof(struct tracking))) == NULL)
    {
    /* FAILURE - cannot alloc memory */
    log_err(errno, "pbs_init", "calloc failure");

    close(fd);

    return(-1);
    }

  /* NOTE:  tracking file records are optional */
  if (read(fd, (char *)server.sv_track, server.sv_tracksize * sizeof(struct tracking)) < 0)
    {
    log_err(errno, "pbs_init", "unable to read tracksize from tracking file");
    }

  close(fd);

  server.sv_trackmodifed = 0;
  
  /* set work task to periodically save the tracking records */
  set_task(WORK_Timed, (long)(time(NULL) + PBS_SAVE_TRACK_TM), track_save, NULL, FALSE);

  return(rc);
  } /* END handle_tracking_records() */




/*
 * setup_threadpool - size and create the request thread pool.
 *
 * Starting values are get_default_threads() for the minimum, ten times
 * that for the maximum and DEFAULT_THREAD_IDLE for the idle time; each is
 * then passed to get_svr_attr_l() so the matching server attribute
 * (min_threads, max_threads, thread_idle_seconds) can replace it.
 */

void setup_threadpool()

  {
  long idle_seconds = DEFAULT_THREAD_IDLE;
  long lower_bound  = get_default_threads();
  long upper_bound  = lower_bound * 10;

  /* setup the threadpool for use */
  get_svr_attr_l(SRV_ATR_minthreads, &lower_bound);
  get_svr_attr_l(SRV_ATR_maxthreads, &upper_bound);
  get_svr_attr_l(SRV_ATR_threadidleseconds, &idle_seconds);

  initialize_threadpool(&request_pool, lower_bound, upper_bound, idle_seconds);
  } /* END setup_threadpool() */




/*
 * This file contains the functions to initialize the PBS Batch Server.
 * The code is called once when the server is brought up.
 */

/*
 * pbsd_init - one-time initialization of the batch server.
 *
 * type - type of initialization (RECOV_* value), handed to the attribute,
 *        queue and job/array recovery stages
 *
 * Sanitizes the environment and supplementary groups, then runs the
 * start-up stages in order; the first stage that fails ends the
 * initialization and its error code is returned.
 *
 * Returns PBSE_NONE on success, a non-zero value otherwise.
 */

int pbsd_init(

  int type)  /* type of initialization   */

  {
  int               ret = PBSE_NONE;
  gid_t             gid;
  char              log_buf[LOCAL_LOG_BUF_SIZE];

  memset(&hints, 0, sizeof(hints));
  hints.ai_flags = AI_CANONNAME;

  /* The following is code to reduce security risks */
  if (setup_env(PBS_ENVIRON) == -1)
    {
    return(-1);
    }

  gid = getgid();

  /* secure suppl. groups */
  if (setgroups(1, &gid) != 0)
    {
    snprintf(log_buf, sizeof(log_buf),
      "Unable to drop secondary groups. Some MAC framework is active?\n");
    log_err(errno, __func__, log_buf);
    snprintf(log_buf, sizeof(log_buf),
      "setgroups(group = %lu) failed: %s\n",
      (unsigned long)gid, strerror(errno));
    log_err(errno, __func__, log_buf);

    return(-1);
    }

  setup_threadpool();

  setup_limits();

  initialize_network_info();

  /* 1. set up to catch or ignore various signals */
  if ((ret = setup_signal_handling()) != PBSE_NONE)
    return(ret);

  /* 2. set up the various paths and other global variables we need */
  if ((ret = initialize_paths()) != PBSE_NONE)
    return(ret);

  /* the result used to be dropped; do not continue without the global
   * mutexes and containers */
  if ((ret = initialize_data_structures_and_mutexes()) != PBSE_NONE)
    return(ret);

  /* 3. Set default server attibutes values */
  if ((ret = setup_server_attrs(type)) != PBSE_NONE)
    return(ret);

  /* Open and read in node list if one exists */
  if ((ret = initialize_nodes()) != PBSE_NONE)
    return(ret);

  /* the functions we're calling assume this mutex is locked */
  snprintf(log_buf, sizeof(log_buf), "%s:1", __func__);
  lock_sv_qs_mutex(server.sv_qs_mutex, log_buf);

  if ((ret = handle_queue_recovery(type)) != PBSE_NONE)
    return(ret);

  if ((ret = handle_job_and_array_recovery(type)) != PBSE_NONE)
    return(ret);

  /* Put us back in the Server's Private directory */
  if (chdir(path_priv) != 0)
    {
    snprintf(log_buf, sizeof(log_buf), msg_init_chdir, path_priv);

    log_err(-1, __func__, log_buf);

    return(3);
    }

  /* a failure here means the tracking table may not have been allocated,
   * so it must stop the start-up instead of being ignored */
  if ((ret = handle_tracking_records()) != PBSE_NONE)
    return(ret);

  /* read the hierarchy file */
  if ((hierarchy_holder = prepare_mom_hierarchy()) == NULL)
    {
    /* hierarchy file exists but we couldn't open it */
    return(-1);
    }

  /* mark all nodes as needing a hello */
  add_all_nodes_to_hello_container();

  /* allow the threadpool to start processing */
  start_request_pool();

  /* SUCCESS */
  return(PBSE_NONE);
  }  /* END pbsd_init() */


/*
 * build_path - build the pathname for a PBS directory
 *
 * parent - parent directory name; a '/' is inserted after it unless it
 *          already ends in one
 * name   - sub directory (or file) name
 * suffix - optional suffix to append, may be NULL
 *
 * Returns a heap-allocated string "<parent>/<name><suffix>" that the
 * caller owns.  The buffer is at least PATH_MAX bytes, as before, and
 * grows when the combined path needs more.  If memory cannot be
 * allocated the error is logged, the log is closed and the process exits
 * with status 3.
 */

char *build_path(

  char *parent,  /* parent directory name (dirname) */
  char *name,  /* sub directory name */
  char *suffix)  /* suffix string to append */

  {
  int     prefixslash;
  char   *ppath;
  size_t  parent_len = strlen(parent);
  size_t  len;

  /* an empty parent has no last character to inspect */
  if ((parent_len > 0) && (parent[parent_len - 1] == '/'))
    prefixslash = 0;
  else
    prefixslash = 1;

  /*
   * space for the names + maybe a slash between + the suffix + the NUL
   */

  len = parent_len + strlen(name) + prefixslash + 1;

  if (suffix != NULL)
    len += strlen(suffix);

  /* keep the historical PATH_MAX-sized buffer as a minimum, but never
   * allocate less than the pieces copied below actually need */
  if (len < (size_t)PATH_MAX)
    len = (size_t)PATH_MAX;

  ppath = calloc(1, len);

  if (ppath != NULL)
    {
    strcat(ppath, parent);

    if (prefixslash)
      strcat(ppath, "/");

    strcat(ppath, name);

    if (suffix)
      strcat(ppath, suffix);

    return(ppath);
    }

  log_err(errno, "build_path", msg_err_malloc);

  pthread_mutex_lock(log_mutex);
  log_close(1);
  pthread_mutex_unlock(log_mutex);

  exit(3);
  }  /* END build_path() */


/*
 * pbsd_init_job - decide what to do with the recovered job structure
 *
 * The action depends on the type of initialization and on the substate
 * the job was saved in.
 *
 * pjob - the recovered job
 * type - initialization type (RECOV_* value)
 *
 * Returns FAILURE when the job was handed to init_abt_job() (cold/create
 * start) or when job_abt() left pjob NULL for an unknown substate;
 * returns SUCCESS otherwise.
 *
 * NOTE(review): several helpers called here (pbsd_init_reque, job_abt,
 * init_abt_job) may dispose of the job; callers should not assume the
 * pointer they passed in is still valid after a FAILURE return - confirm.
 */

int pbsd_init_job(

  job *pjob,  /* I */
  int  type)  /* I */

  {
  unsigned int      d;

  time_t            time_now = time(NULL);
  char              log_buf[LOCAL_LOG_BUF_SIZE];
  int               local_errno = 0;
  char  job_id[PBS_MAXSVRJOBID+1];
  long  job_atr_hold;
  int   job_exit_status;

  /* no connection to a MOM is open for a freshly recovered job */
  pjob->ji_momhandle = -1;

  /* update at_server pbs_attribute in case name changed */

  job_attr_def[JOB_ATR_at_server].at_free(
    &pjob->ji_wattr[JOB_ATR_at_server]);

  job_attr_def[JOB_ATR_at_server].at_decode(
    &pjob->ji_wattr[JOB_ATR_at_server],
    NULL,
    NULL,
    server_name,
    0);

  /* update queue_rank if this job is higher than current */
  /* the comparison is done on the values converted to unsigned long, so a
   * negative rank compares as a very large number */

  if ((unsigned long)pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long > (unsigned long)queue_rank)
    queue_rank = pjob->ji_wattr[JOB_ATR_qrank].at_val.at_long;

  /* now based on the initialization type */

  if ((type == RECOV_COLD) || (type == RECOV_CREATE))
    {
/*    need_y_response(type);*/

    /* cold and create starts do not keep recovered jobs */
    init_abt_job(pjob);

    return(FAILURE);
    }

  /* the hot-start flag is only kept when this is a hot start */
  if (type != RECOV_HOT)
    pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_HOTSTART;

  switch (pjob->ji_qs.ji_substate)
    {

    case JOB_SUBSTATE_TRANSICM:

      if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)
        {
        /*
         * This server created the job, so client
         * was qsub (a transient client), it won't be
         * around to recommit, so auto-commit now
         */

        pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

        pbsd_init_reque(pjob, CHANGE_STATE);
        }
      else
        {
        /*
         * another server is sending, append to new job
         * list and wait for commit; need to clear
         * receiving socket number though
         */

        pjob->ji_qs.ji_un.ji_newt.ji_fromsock = -1;

        insert_job(&newjobs,pjob);
        }

      break;

    case JOB_SUBSTATE_TRNOUT:

      pjob->ji_qs.ji_state = JOB_STATE_QUEUED;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_QUEUED;

      /* requeue as queued */

      pbsd_init_reque(pjob, CHANGE_STATE);

      break;

    case JOB_SUBSTATE_TRNOUTCM:

      /* requeue as is - rdy to cmt */

      pbsd_init_reque(pjob, KEEP_STATE);

      /* resend rtc */
      /* the work task receives its own heap copy of the job id */

      set_task(WORK_Immed, 0, resume_net_move, strdup(pjob->ji_qs.ji_jobid), FALSE);

      break;

    /* all of the following substates are simply re-enqueued with their
     * state re-evaluated */
    case JOB_SUBSTATE_QUEUED:

    case JOB_SUBSTATE_PRESTAGEIN:

    case JOB_SUBSTATE_STAGEIN:

    case JOB_SUBSTATE_STAGECMP:

    case JOB_SUBSTATE_STAGEFAIL:

    case JOB_SUBSTATE_STAGEGO:

    case JOB_SUBSTATE_CHKPTGO:

    case JOB_SUBSTATE_CHKPTCMP:

    case JOB_SUBSTATE_HELD:

    case JOB_SUBSTATE_SYNCHOLD:

    case JOB_SUBSTATE_DEPNHOLD:

    case JOB_SUBSTATE_WAITING:

    case JOB_SUBSTATE_PRERUN:

    case JOB_SUBSTATE_ARRAY_TEMP:

      pbsd_init_reque(pjob, CHANGE_STATE);

      break;

    case JOB_SUBSTATE_RUNNING:

      pbsd_init_reque(pjob, KEEP_STATE);

      /* clear the "resources assigned" flag before set_resc_assigned()
       * is asked to increment the assigned resources again */
      pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_RescAssn;

      set_resc_assigned(pjob, INCR);

      /* suspended jobs don't get reassigned to nodes */

      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0)
        {
        set_old_nodes(pjob);
        }

      if (type == RECOV_HOT)
        pjob->ji_qs.ji_svrflags |= JOB_SVFLG_HOTSTART;

      break;

    case JOB_SUBSTATE_SYNCRES:

      /* clear all dependent job ready flags */

      depend_clrrdy(pjob);

      pbsd_init_reque(pjob, CHANGE_STATE);

      break;

    case JOB_SUBSTATE_EXITING:

    case JOB_SUBSTATE_STAGEOUT:

    case JOB_SUBSTATE_STAGEDEL:

    case JOB_SUBSTATE_EXITED:

    case JOB_SUBSTATE_ABORT:

      /* This is delayed because it is highly likely MS is "state-unknown"
       * at this time, and there's no real hurry anyways. */

      apply_job_delete_nanny(pjob, time_now + 60);

      set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);

      pbsd_init_reque(pjob, KEEP_STATE);

      break;

    case JOB_SUBSTATE_COMPLETE:

      /* Completed jobs are no longer purged on startup */
      set_task(WORK_Immed, 0, on_job_exit, strdup(pjob->ji_qs.ji_jobid), FALSE);

      pbsd_init_reque(pjob, KEEP_STATE);

      /* do array bookeeping */
      if ((pjob->ji_arraystruct != NULL) &&
          (pjob->ji_is_array_template == FALSE))
        {
        /* get_jobs_array() takes the address of pjob and may set it to
         * NULL; NOTE(review): pa is used below without a NULL check -
         * presumably it is valid whenever pjob is still set, confirm */
        job_array *pa = get_jobs_array(&pjob);

        if (pjob != NULL)
          {
          /* copy what update_array_values() needs, because the job mutex
           * is released before that call */
          strcpy(job_id, pjob->ji_qs.ji_jobid);
          job_atr_hold = pjob->ji_wattr[JOB_ATR_hold].at_val.at_long;
          job_exit_status = pjob->ji_qs.ji_un.ji_exect.ji_exitstat;
          pthread_mutex_unlock(pjob->ji_mutex);
          update_array_values(pa,JOB_STATE_RUNNING,aeTerminate,
              job_id, job_atr_hold, job_exit_status);
          
          if (LOGLEVEL >= 7)
            {
            sprintf(log_buf, "%s: unlocking ai_mutex", __func__);
            log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, job_id, log_buf);
            }
          pthread_mutex_unlock(pa->ai_mutex);
          /* look the job up again by id; pjob is NULL afterwards if it
           * can no longer be found */
          pjob = find_job(job_id);
          }
         
        }

      break;

    case JOB_SUBSTATE_RERUN:

      /* the rerun task is only scheduled if the job was exiting */
      if (pjob->ji_qs.ji_state == JOB_STATE_EXITING)
        set_task(WORK_Immed, 0, on_job_rerun, strdup(pjob->ji_qs.ji_jobid), FALSE);

      pbsd_init_reque(pjob, KEEP_STATE);

      break;

    case JOB_SUBSTATE_RERUN1:

    case JOB_SUBSTATE_RERUN2:

      set_task(WORK_Immed, 0, on_job_rerun, strdup(pjob->ji_qs.ji_jobid), FALSE);

      pbsd_init_reque(pjob, KEEP_STATE);

      break;

    default:

      /* unknown substate: log it and abort the job */
      sprintf(log_buf, msg_init_unkstate, pjob->ji_qs.ji_substate);

      log_event(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,pjob->ji_qs.ji_jobid,log_buf);

      job_abt(&pjob, log_buf); /* pjob is not freed */

      /* job_abt() receives the address of pjob and can clear it */
      if (pjob == NULL)
        {
        return(FAILURE);
        }

      break;
    }    /* END switch (pjob->ji_qs.ji_substate) */


  /* if job has IP address of Mom, it may have changed */
  /* reset based on hostname                           */

  if (pjob != NULL)
    {
    if ((pjob->ji_qs.ji_un_type == JOB_UNION_TYPE_EXEC) &&
        (pjob->ji_qs.ji_un.ji_exect.ji_momaddr != 0))
      {
      if (pjob->ji_wattr[JOB_ATR_exec_host].at_flags & ATR_VFLAG_SET)
        {
        /* re-resolve the host parsed out of exec_host; tmp is released
         * with free() once the address has been looked up */
        char *tmp = parse_servername(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, &d);
        pjob->ji_qs.ji_un.ji_exect.ji_momaddr = get_hostaddr(&local_errno, tmp);
        free(tmp);
        }
      else
        {
        pjob->ji_qs.ji_un.ji_exect.ji_momaddr = 0;
        }
      }
    }

  return(SUCCESS);
  }  /* END pbsd_init_job() */





/*
 * pbsd_init_reque - put a recovered job back into the queue it was in.
 *
 * pjob         - the recovered job (may be aborted here if it cannot be
 *                enqueued)
 * change_state - non-zero: re-evaluate and set the job state first;
 *                zero: keep the state and only refresh the state char
 *
 * server.sv_qs_mutex is taken around the enqueue and released before
 * returning; it is dropped temporarily while a job that could not be
 * enqueued is aborted.
 */

void pbsd_init_reque(

  job *pjob,         /* I (modified/possibly freed) */
  int  change_state) /* I */

  {
  char event_msg[265];
  char lock_id[LOCAL_LOG_BUF_SIZE];
  int  state;
  int  substate;
  int  enqueue_rc;

  sprintf(event_msg, msg_init_substate,
          pjob->ji_qs.ji_substate);

  /* re-enqueue the job into the queue it was in */

  if (!change_state)
    {
    set_statechar(pjob);
    }
  else
    {
    /* update the state, typically to some form of QUEUED */

    svr_evaljobstate(pjob, &state, &substate, 0);

    svr_setjobstate(pjob, state, substate, FALSE);
    }

  sprintf(lock_id, "%s:1", __func__);
  lock_sv_qs_mutex(server.sv_qs_mutex, lock_id);

  enqueue_rc = svr_enquejob(pjob, TRUE, -1);

  if (enqueue_rc != PBSE_NONE)
    {
    /* a recycled job is left alone; anything else is reported and the
     * job aborted */
    int must_abort = (enqueue_rc != PBSE_JOB_RECYCLED);

    /* Oops, this should never happen */
    if (must_abort)
      {
      sprintf(event_msg, "%s; job %s queue %s",
        msg_err_noqueue,
        pjob->ji_qs.ji_jobid,
        pjob->ji_qs.ji_queue);
    
      log_err(-1, "pbsd_init_reque", event_msg);
      }

    /* job_abt() runs without the queue/server mutex held */
    unlock_sv_qs_mutex(server.sv_qs_mutex, lock_id);

    if (must_abort)
      job_abt(&pjob, event_msg);

    lock_sv_qs_mutex(server.sv_qs_mutex, lock_id);

    /* NOTE:  pjob freed but dangling pointer remains */
    }
  else
    {
    strcat(event_msg, msg_init_queued);
    strcat(event_msg, pjob->ji_qs.ji_queue);

    log_event(
      PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      event_msg);
    }

  sprintf(lock_id, "%s:1", __func__);
  unlock_sv_qs_mutex(server.sv_qs_mutex, lock_id);
  }  /* END pbsd_init_reque() */




/*
 * Catch core dump signals - set core size so we can see what happened!
 *
 * sig - the signal that was caught; it is recorded in the log entry
 *
 * Restores the default disposition for the fatal signals, logs the event,
 * raises the core file size limit to unlimited and aborts.  Does not
 * return.
 *
 * Chris Samuel - VPAC
 * csamuel@vpac.org - 29th July 2003
 */

void catch_abort(

  int sig)  /* I */

  {

  struct rlimit rlimit;

  struct sigaction act;

  /*
   * Reset ourselves to the default signal handler to try and
   * prevent recursive core dumps.
   */

  sigemptyset(&act.sa_mask);
  act.sa_flags   = 0;
  act.sa_handler = SIG_DFL;

  sigaction(SIGSEGV, &act, NULL);
  sigaction(SIGBUS, &act, NULL);
  sigaction(SIGFPE, &act, NULL);
  sigaction(SIGILL, &act, NULL);
  sigaction(SIGTRAP, &act, NULL);
  sigaction(SIGSYS, &act, NULL);

  /* attribute the entry to this handler; it used to be logged under the
   * name "mom_main" even though this file initializes the batch server */
  log_err(sig, __func__, "Caught fatal core signal");

  rlimit.rlim_cur = RLIM_INFINITY;
  rlimit.rlim_max = RLIM_INFINITY;

  setrlimit(RLIMIT_CORE, &rlimit);
  abort();
  }  /* END catch_abort() */





/*
 * change_logs - signal handler for SIGHUP
 *
 * Closes and reopens the accounting file and the server log so the old
 * files can be renamed; the job log is reopened as well when the
 * record_job_info server attribute is set.  Each invocation also toggles
 * the RPP debug print flag.
 */

void change_logs(

  int sig)

  {
  long job_info_enabled = FALSE;

  acct_close();

  /* cycle the server log under its mutex */
  pthread_mutex_lock(log_mutex);
  log_close(1);
  log_open(log_file, path_log);
  pthread_mutex_unlock(log_mutex);

  acct_open(acct_file);

  get_svr_attr_l(SRV_ATR_RecordJobInfo, &job_info_enabled);

  if (job_info_enabled != 0)
    {
    pthread_mutex_lock(job_log_mutex);
    job_log_open(job_log_file, path_jobinfo_log);
    pthread_mutex_unlock(job_log_mutex);
    }

  rpp_dbprt = 1 - rpp_dbprt; /* toggle debug prints for RPP */
  }

/*
 * change_log_level - signal handler for SIGUSR1 and SIGUSR2
 * Increases log level if SIGUSR1 is received (upper bound 7).
 * Decreases log level if SIGUSR2 is received (lower bound 0).
 * Variable plogenv tells us whether or not PBSLOGLEVEL was specified
 * If it was not then we start from, and afterwards update, the server
 * log level pbs_attribute which allows qmgr to see the current log level
 * value
 */

void change_log_level(

  int sig)

  {
  char log_buf[LOCAL_LOG_BUF_SIZE];
  long level = 0;

  get_svr_attr_l(SRV_ATR_LogLevel, &level);

  if ((sig == SIGUSR1) || (sig == SIGUSR2))
    {
    /* without PBSLOGLEVEL the attribute is the authoritative start value */
    if (plogenv == NULL)
      LOGLEVEL = level;

    if (sig == SIGUSR1)
      {
      /* increase log level */
      LOGLEVEL = MIN(LOGLEVEL + 1, 7);
      }
    else
      {
      /* decrease log level */
      LOGLEVEL = MAX(LOGLEVEL - 1, 0);
      }

    if (plogenv == NULL)
      {
      /* the attribute is read above through a long; hand set_svr_attr()
       * a long as well rather than the address of LOGLEVEL itself */
      level = LOGLEVEL;

      set_svr_attr(SRV_ATR_LogLevel, &level);
      }
    }

  sprintf(log_buf, "received signal %d: adjusting loglevel to %d", sig, LOGLEVEL);

  log_record(
    PBSEVENT_SYSTEM | PBSEVENT_FORCE,
    PBS_EVENTCLASS_SERVER,
    msg_daemonname,
    log_buf);

  return;
  }  /* END change_log_level() */




/*
 * stop_me - signal handler for all caught signals which terminate the server
 *
 * Sets the server state attribute (SRV_ATR_State) to SV_STATE_SHUTSIG and
 * does nothing else; the signal number itself is not stored.
 * NOTE(review): presumably the main loop notices the state change and
 * performs the actual shutdown - confirm against the caller.
 */

/*ARGSUSED*/

void stop_me(

  int sig)  /* I (not used) */

  {
  long shutdown_state;

  shutdown_state = SV_STATE_SHUTSIG;

  set_svr_attr(SRV_ATR_State, &shutdown_state);
  }  /* END stop_me() */



/*
 * chk_save_file - decide whether a name refers to a usable save file
 *
 * Returns 0 when filename is an existing regular file, the errno value
 * left by stat() when stat() fails, and -1 when the name begins with '.'
 * or names something other than a regular file.
 */

int chk_save_file(

  char *filename)  /* I - name to examine */

  {
  struct stat info;
  int         verdict = -1;  /* rejected unless proven otherwise */

  /* names with a leading dot are rejected without touching the file system */

  if (filename[0] != '.')
    {
    if (stat(filename, &info) == -1)
      {
      verdict = errno;
      }
    else if (S_ISREG(info.st_mode))
      {
      verdict = 0;
      }
    }

  return(verdict);
  }  /* END chk_save_file() */





/*
 * resume_net_move - call net_move() to complete the routing of a job
 *
 * This is invoked via a work task created on recovery of a job
 * in JOB_SUBSTATE_TRNOUTCM state.  ptask->wt_parm1 carries the job id
 * string; the string, the task's mutex and the task itself are all freed
 * here before returning.
 *
 * If the job id no longer resolves to a job, nothing is moved.
 */

void resume_net_move(

  struct work_task *ptask)  /* I - work task, freed before returning */

  {
  char *jobid = ptask->wt_parm1;
  job  *pjob;

  if (jobid != NULL)
    {
    pjob = find_job(jobid);

    /* the job may be gone by the time this task runs; a NULL result
     * must be passed neither to net_move() nor to the unlock below */

    if (pjob != NULL)
      {
      net_move(pjob, 0);

      /* NOTE(review): assumes find_job() hands the job back with
       * ji_mutex held, as this unlock implies - confirm */

      pthread_mutex_unlock(pjob->ji_mutex);
      }

    free(jobid);
    }

  free(ptask->wt_mutex);
  free(ptask);
  } /* END resume_net_move() */


/*
 * rm_files - on a RECOV_CREATE, remove the files under the specified
 * directory (path_priv).
 *
 * Every non-directory entry of dirname is unlinked.  Subdirectories are
 * descended into only when their name appears in byebye[] below; any other
 * subdirectory (for example "jobs") is left untouched.  Entries that cannot
 * be stat()ed are skipped silently; a failed unlink() is logged.  A
 * directory that cannot be opened is ignored.
 */

void rm_files(

  char *dirname)  /* I - directory whose contents are removed */

  {
  DIR           *dir;
  int            i;
  int            len;
  int            saved_errno;

  struct stat    stb;

  struct dirent *pdirt;
  char           path[1024];
  char           log_buf[LOCAL_LOG_BUF_SIZE];

  /* list of directories in which files are removed */

  static const char * const byebye[] =
    {
    "acl_groups",
    "acl_hosts",
    "acl_svr",
    "acl_users",
    "hostlist",
    "queues",
    NULL
    };      /* keep as last entry */

  dir = opendir(dirname);

  if (dir == NULL)
    {
    return;
    }

  while ((pdirt = readdir(dir)) != NULL)
    {
    len = snprintf(path, sizeof(path), "%s/%s", dirname, pdirt->d_name);

    if ((len < 0) || ((size_t)len >= sizeof(path)))
      {
      /* a truncated path would name some other file - never act on it */

      snprintf(log_buf, sizeof(log_buf), "path too long, skipping %s", pdirt->d_name);

      log_err(ENAMETOOLONG, "pbsd_init", log_buf);

      continue;
      }

    if (stat(path, &stb) != 0)
      {
      continue;
      }

    if (S_ISDIR(stb.st_mode))
      {
      for (i = 0; byebye[i] != NULL; ++i)
        {
        if (strcmp(pdirt->d_name, byebye[i]) == 0)
          {
          rm_files(path);

          /* names in byebye[] are distinct, no further match is possible */

          break;
          }
        }
      }
    else if (unlink(path) == -1)
      {
      /* capture errno before any library call can overwrite it */

      saved_errno = errno;

      snprintf(log_buf, sizeof(log_buf), "cannot unlink %s", path);

      log_err(saved_errno, "pbsd_init", log_buf);
      }
    }

  closedir(dir);

  return;
  }  /* END rm_files() */





/*
 * init_abt_job() - report that a job is being aborted at initialization,
 * then purge it.
 *
 * The abort message (msg_init_abt) is written to the log under the job's
 * id and mailed to the job owner; afterwards the job is handed to
 * job_purge().  Must be called after the job is enqueued.
 */

void init_abt_job(

  job *pjob)  /* I - job to abort; passed to job_purge() */

  {
  const int abort_events = PBSEVENT_SYSTEM | PBSEVENT_ADMIN | PBSEVENT_DEBUG;

  log_event(abort_events, PBS_EVENTCLASS_JOB, pjob->ji_qs.ji_jobid, msg_init_abt);

  svr_mailowner(pjob, MAIL_ABORT, MAIL_NORMAL, msg_init_abt);

  /* pjob must not be used once it has been purged */

  job_purge(pjob);
  }  /* END init_abt_job() */




/*
 * recov_svr_attr - read the server attributes in from the server db.
 *
 * Nothing is read when type is RECOV_CREATE.  Otherwise path_priv and
 * path_svrdb are built if they are still NULL, the resource definitions
 * are initialized if svr_resc_def is still NULL, and the server database
 * file is checked and then loaded with svr_recov_xml().
 *
 * Returns 0 on success (or when type is RECOV_CREATE) and -1, after
 * logging msg_init_baddb, when any step fails.
 */

int recov_svr_attr(

  int type)  /* I - type of initialization */

  {
  int   rc;
  char *dir_suffix = "/";

  if (type == RECOV_CREATE)
    {
    /* a freshly created server has no database to read */

    return(0);
    }

  /* work out where the server database (save file) lives */

  if (path_priv == NULL)
    path_priv = build_path(path_home, PBS_SVR_PRIVATE, dir_suffix);

  if (path_svrdb == NULL)
    path_svrdb = build_path(path_priv, PBS_SERVERDB, NULL);

  if ((svr_resc_def == NULL) && ((rc = init_resc_defs()) != 0))
    {
    log_err(rc, "pbsd_init", msg_init_baddb);

    return(-1);
    }

  /* the file is only read once it has passed the save file check */

  rc = chk_save_file(path_svrdb);

  if (rc == 0)
    {
    rc = svr_recov_xml(path_svrdb, TRUE);

    /* only a result of -1 counts as failure here */

    if (rc != -1)
      {
      return(0);
      }
    }

  log_err(rc, __func__, msg_init_baddb);

  return(-1);
  }  /* END recov_svr_attr() */

