/*
*         OpenPBS (Portable Batch System) v2.3 Software License
* 
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
* 
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
* other than those described below, or to purchase support for this software,
* please contact Veridian Systems, PBS Products Department ("Licensor") at:
* 
*    www.OpenPBS.org  +1 650 967-4675                  sales@OpenPBS.org
*                        877 902-4PBS (US toll-free)
* ---------------------------------------------------------------------------
* 
* This license covers use of the OpenPBS v2.3 software (the "Software") at
* your site or location, and, for certain users, redistribution of the
* Software to other sites and locations.  Use and redistribution of
* OpenPBS v2.3 in source and binary forms, with or without modification,
* are permitted provided that all of the following conditions are met.
* After December 31, 2001, only conditions 3-6 must be met:
* 
* 1. Commercial and/or non-commercial use of the Software is permitted
*    provided a current software registration is on file at www.OpenPBS.org.
*    If use of this software contributes to a publication, product, or
*    service, proper attribution must be given; see www.OpenPBS.org/credit.html
* 
* 2. Redistribution in any form is only permitted for non-commercial,
*    non-profit purposes.  There can be no charge for the Software or any
*    software incorporating the Software.  Further, there can be no
*    expectation of revenue generated as a consequence of redistributing
*    the Software.
* 
* 3. Any Redistribution of source code must retain the above copyright notice
*    and the acknowledgment contained in paragraph 6, this list of conditions
*    and the disclaimer contained in paragraph 7.
* 
* 4. Any Redistribution in binary form must reproduce the above copyright
*    notice and the acknowledgment contained in paragraph 6, this list of
*    conditions and the disclaimer contained in paragraph 7 in the
*    documentation and/or other materials provided with the distribution.
* 
* 5. Redistributions in any form must be accompanied by information on how to
*    obtain complete source code for the OpenPBS software and any
*    modifications and/or additions to the OpenPBS software.  The source code
*    must either be included in the distribution or be available for no more
*    than the cost of distribution plus a nominal fee, and all modifications
*    and additions to the Software must be freely redistributable by any party
*    (including Licensor) without restriction.
* 
* 6. All advertising materials mentioning features or use of the Software must
*    display the following acknowledgment:
* 
*     "This product includes software developed by NASA Ames Research Center,
*     Lawrence Livermore National Laboratory, and Veridian Information 
*     Solutions, Inc.
*     Visit www.OpenPBS.org for OpenPBS software support,
*     products, and information."
* 
* 7. DISCLAIMER OF WARRANTY
* 
* THIS SOFTWARE IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY KIND. ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT
* ARE EXPRESSLY DISCLAIMED.
* 
* IN NO EVENT SHALL VERIDIAN CORPORATION, ITS AFFILIATED COMPANIES, OR THE
* U.S. GOVERNMENT OR ANY OF ITS AGENCIES BE LIABLE FOR ANY DIRECT OR INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* 
* This license will be governed by the laws of the Commonwealth of Virginia,
* without reference to its choice of law rules.
*/

#include <pbs_config.h>   /* the master config generated by configure */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <pwd.h>
#include <grp.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <signal.h>
#include <termios.h>
#include <ctype.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#if IBM_SP2==2	/* IBM SP with PSSP 3.1 */
#include <st_client.h>
#endif	/* IBM SP */

#if defined(PENABLE_DYNAMIC_CPUSETS)
# define CBUFFERSIZE 4095
# include <cpuset.h>
#endif /* PENABLE_DYNAMIC_CPUSETS */

#include "libpbs.h"
#include "portability.h"
#include "list_link.h"
#include "server_limits.h"
#include "attribute.h"
#include "resource.h"
#include "job.h"
#include "log.h"
#include "rpp.h"
#include "mom_mach.h"
#include "mom_func.h"
#include "pbs_error.h"
#include "svrfunc.h"
#include "net_connect.h"
#include "dis.h"
#include "batch_request.h"
#include "md5.h"
#include "mcom.h"
#ifdef ENABLE_CPA
#include "pbs_cpa.h"
#endif

#define EXTRA_VARIABLE_SPACE 2000
#define EXTRA_ENV_PTRS	       32

/* Global Variables */


extern  int		num_var_env;
extern	char	      **environ;
extern	int		exiting_tasks;
extern	int		lockfds;
extern	tlist_head	mom_polljobs;
extern	char		*path_checkpoint;
extern	char		*path_jobs;
extern	char		*path_prolog;
extern  char            *path_prologuser;
extern  char            *path_prologp;
extern  char            *path_prologuserp;
extern	char		*path_spool;
extern	char		*path_aux;
extern	gid_t		 pbsgroup;
extern	time_t		time_now;
extern	unsigned int	pbs_rm_port;
extern	u_long		localaddr;
extern  char            *nodefile_suffix;

extern int LOGLEVEL;
extern long TJobStartBlockTime;

extern char *get_job_envvar(job *,char *);


int              mom_reader_go;		/* see catchinter() & mom_writer() */
struct var_table vtable;		/* for building up Job's environ */

extern char             tmpdir_basename[];  /* for TMPDIR */

/* Local Variables */

static int	 script_in;	/* script file, will be stdin	  */
static pid_t	 writerpid;	/* writer side of interactive job */
static pid_t	 shellpid;	/* shell part of interactive job  */


/*
 * Names of the environment variables whose values are computed by MOM and
 * added to every job environment.
 *
 * NOTE: InitUserEnv() refers to these entries by numeric index, so the
 * order of this table must not change without updating those indices.
 */
static	char *variables_else[] = {	/* variables to add, value computed */
  "HOME",           /* [0]  */
  "LOGNAME",        /* [1]  */
  "PBS_JOBNAME",    /* [2]  */
  "PBS_JOBID",      /* [3]  */
  "PBS_QUEUE",      /* [4]  */
  "SHELL",          /* [5]  */
  "USER",           /* [6]  */
  "PBS_JOBCOOKIE",  /* [7]  */
  "PBS_NODENUM",    /* [8]  */
  "PBS_TASKNUM",    /* [9]  */
  "PBS_MOMPORT",    /* [10] */
  "PBS_NODEFILE",   /* [11] */
  "TMPDIR" };       /* [12] */

/* number of entries in variables_else[] */
static	int num_var_else = sizeof(variables_else) / sizeof(char *);

/* prototypes */

static	void starter_return A_((int,int,int,struct startjob_rtn *));
static	void catchinter A_((int));

int TMomFinalizeJob1(job *,pjobexec_t *,int *);
int TMomFinalizeJob2(pjobexec_t *,int *);
int TMomFinalizeJob3(pjobexec_t *,int,int,int *);
int TMomFinalizeChild(pjobexec_t *);

int TMomCheckJobChild(pjobexec_t *,int,int *,int *);
static int search_env_and_open(const char *,u_long);
extern int TMOMJobGetStartInfo(job *,pjobexec_t **);
extern int mom_reader(int,int);
extern int mom_writer(int,int);
extern int x11_create_display(int, char *,char *phost,int pport,char *homedir,char *x11authstr);


/* END prototypes */


/*
 * FDMOVE(fd) - if descriptor fd is 0, 1 or 2, duplicate it onto the lowest
 * free descriptor numbered 3 or higher, close the original and store the
 * new descriptor back into fd.
 *
 * Caveats:
 *  - fd is evaluated several times and is assigned to, so it must be a
 *    side-effect-free lvalue.
 *  - if fcntl() fails, the original descriptor is still closed and fd is
 *    left holding fcntl()'s failure value; callers should re-check fd.
 *  - the expansion is a bare "if (...) { ... }" and at least one use in
 *    this file omits the trailing semicolon, so do not convert it to the
 *    do { } while (0) form without fixing every call site, and do not use
 *    it as the unbraced body of an if that has an else.
 */
#define FDMOVE(fd) if (fd < 3) { \
	int hold = fcntl(fd,F_DUPFD,3); \
	close(fd); \
	fd = hold; \
        }




/*
 * no_hang() - signal handler for the alarm() that guards the attempt to
 *	connect back to qsub for an interactive job.  qsub may be hung or
 *	suspended, or the network may be broken, and mom cannot afford to
 *	block forever; the handler only records that the alarm went off.
 */

static void no_hang(

  int sig)  /* I (not used) */

  {
  /* the job is not known here, hence the blank object name */

  LOG_EVENT(PBSEVENT_JOB,PBS_EVENTCLASS_REQUEST," ","alarm timed-out connect to qsub");
  }  /* END no_hang() */





/*
 * check_pwd() - look up the password entry of the user the job is to run
 *	as and, the first time it is called for a job, fill in the job's
 *	execution uid/gid and its group cache (home directory plus
 *	supplementary groups).
 *
 * Returns the entry obtained from getpwnam() on success, or NULL on
 * failure with a description of the problem left in log_buffer.
 *
 * If any step fails after the group cache has been allocated, the cache is
 * released again and ji_grpcache reset to NULL.  Otherwise a later call
 * would take the "already cached" shortcut below and report success for a
 * job whose execution gid / group list was never established.
 */

struct passwd *check_pwd(

  job *pjob)  /* I (modified) */

  {
  struct passwd	*pwdp;
  struct group	*grpp;

  /* NOTE:  should cache entire pwd object (NYI) */

  pwdp = getpwnam(pjob->ji_wattr[(int)JOB_ATR_euser].at_val.at_str);

  if (pwdp == NULL) 
    {
    sprintf(log_buffer,"No Password Entry for User %s",
      pjob->ji_wattr[(int)JOB_ATR_euser].at_val.at_str);
 
    return(NULL);
    }

  if (pjob->ji_grpcache != NULL)
    {
    /* group cache previously loaded and cached */

    return(pwdp);
    }

  pjob->ji_qs.ji_un_type = JOB_UNION_TYPE_MOM;

  pjob->ji_qs.ji_un.ji_momt.ji_exuid = pwdp->pw_uid;

  /* the home directory string is stored at the tail of the cache struct */

  pjob->ji_grpcache = malloc(sizeof(struct grpcache) + strlen(pwdp->pw_dir) + 1);

  if (pjob->ji_grpcache == NULL) 
    {
    sprintf(log_buffer,"Malloc failed");

    return(NULL);
    }

  strcpy(pjob->ji_grpcache->gc_homedir,pwdp->pw_dir);

  /* get the group and supplementary groups under which the job is to be run */

  if ((pjob->ji_wattr[(int)JOB_ATR_egroup].at_flags & 
      (ATR_VFLAG_SET|ATR_VFLAG_DEFLT)) == ATR_VFLAG_SET)  
    {
    /* execution group specified and not default of login group */

    grpp = getgrnam(pjob->ji_wattr[(int)JOB_ATR_egroup].at_val.at_str);

    if (grpp == NULL) 
      {
      sprintf(log_buffer,"No Group Entry for Group %s",
        pjob->ji_wattr[(int)JOB_ATR_egroup].at_val.at_str);

      goto fail;
      }

    pjob->ji_qs.ji_un.ji_momt.ji_exgid = grpp->gr_gid;
    } 
  else 
    {
    /* default to login group */

    pjob->ji_qs.ji_un.ji_momt.ji_exgid = pwdp->pw_gid;	
    }

  pjob->ji_grpcache->gc_ngroup = init_groups(
    pwdp->pw_name,
    pjob->ji_qs.ji_un.ji_momt.ji_exgid,
    NGROUPS_MAX,
    pjob->ji_grpcache->gc_groups);

  if (pjob->ji_grpcache->gc_ngroup < 0) 
    {
    sprintf(log_buffer,"Too many group entries");

    goto fail;
    }

  /* perform site specific check on validity of account */

  if (site_mom_chkuser(pjob)) 
    {
    sprintf(log_buffer,"site_mom_chkuser failed");

    goto fail;
    }

  return(pwdp);

fail:

  /* do not leave a half-initialized cache behind (see header comment) */

  free(pjob->ji_grpcache);

  pjob->ji_grpcache = NULL;

  return(NULL);
  }  /* END check_pwd() */





/*
 * mom_restart_job() - restart every task of a job from the checkpoint
 *	files found in directory 'path'.
 *
 * Each directory entry whose name is longer than two characters is taken
 * to be a task checkpoint file named after its numeric task id.  The task
 * is looked up in the job, restarted through mach_restart(), marked
 * running and saved.
 *
 * Returns the number of tasks restarted, or -1 on failure.  On failure the
 * problem is logged and errno is left as it was at the point of failure
 * (it is preserved across the logging and closedir() calls).
 */

int mom_restart_job(

  job  *pjob,
  char *path)

  {
  static char	id[] = "mom_restart_job";
  size_t	dirlen;
  int		saved_errno;
  char		namebuf[MAXPATHLEN];
  char		*filnam;
  DIR		*dir;
  struct	dirent	*pdir;
  tm_task_id	taskid;
  task		*ptask;
  int		tcount = 0;
  long		mach_restart A_((task *, char *path));

  if ((dir = opendir(path)) == NULL) 
    {
    saved_errno = errno;

    sprintf(log_buffer,"opendir %s", 
      path);
 
    log_err(saved_errno,id,log_buffer);

    errno = saved_errno;

    return(-1);
    }

  /* build "<path>/" in namebuf; file names are appended at filnam */

  dirlen = strlen(path);

  if (dirlen + 2 > sizeof(namebuf))
    {
    sprintf(log_buffer,"%s: checkpoint directory path too long",
      pjob->ji_qs.ji_jobid);

    errno = ENAMETOOLONG;

    goto fail;
    }

  memcpy(namebuf,path,dirlen);

  namebuf[dirlen++] = '/';
  namebuf[dirlen]   = '\0';

  filnam = &namebuf[dirlen];

  while ((pdir = readdir(dir)) != NULL) 
    {
    /* skips "." and ".." (and any other name of at most two characters) */

    if (strlen(pdir->d_name) <= 2)
      continue;

    if ((taskid = (tm_task_id)atoi(pdir->d_name)) == 0) 
      {
      sprintf(log_buffer, "%s: garbled filename %s",
        pjob->ji_qs.ji_jobid, 
        pdir->d_name);

      goto fail;
      }

    if ((ptask = task_find(pjob, taskid)) == NULL) 
      {
      sprintf(log_buffer, "%s: task %d not found",
        pjob->ji_qs.ji_jobid, 
        (int)taskid);

      goto fail;
      }

    /* make sure "<path>/<name>" plus its terminator fits in namebuf */

    if (dirlen + strlen(pdir->d_name) >= sizeof(namebuf))
      {
      sprintf(log_buffer, "%s: task %d checkpoint file name too long",
        pjob->ji_qs.ji_jobid,
        (int)taskid);

      errno = ENAMETOOLONG;

      goto fail;
      }

    strcpy(filnam,pdir->d_name);

    if (mach_restart(ptask,namebuf) == -1) 
      {
      /* keep errno intact while the message is formatted */

      saved_errno = errno;

      sprintf(log_buffer, "%s: task %d failed from file %s",
        pjob->ji_qs.ji_jobid, 
        (int)taskid, 
        namebuf);

      errno = saved_errno;

      goto fail;
      }

    ptask->ti_qs.ti_status = TI_STATE_RUNNING;

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "task set to running (mom_restart_job)");
      }
 
    task_save(ptask);

    tcount++;
    }

  closedir(dir);

  return(tcount);

fail:

  /* callers examine errno after a failure, so shield it from the cleanup */

  saved_errno = errno;

  log_err(saved_errno,id,log_buffer);

  closedir(dir);

  errno = saved_errno;

  return(-1);
  }  /* END mom_restart_job() */




/*
 * exec_bail() - give up on starting a job: notify the sister nodes, record
 *	the exit code, mark the job as exiting, save it, and close the job's
 *	stdout/stderr descriptors.
 */

void exec_bail(

  job *pjob,  /* I (modified) */
  int  code)  /* I - exit status to record for the job */

  {
  /* tell the non-MS nodes that the job is being aborted */

  send_sisters(pjob,IM_ABORT_JOB);

  /* record why the job is going away, then save that state */

  pjob->ji_qs.ji_un.ji_momt.ji_exitstat = code;
  pjob->ji_qs.ji_substate               = JOB_SUBSTATE_EXITING;

  job_save(pjob,SAVEJOB_QUICK);

  exiting_tasks = 1;

  /* only descriptors greater than zero are closed */

  if (pjob->ji_stdout > 0)
    {
    close(pjob->ji_stdout);
    }

  if (pjob->ji_stderr > 0)
    {
    close(pjob->ji_stderr);
    }
  }  /* END exec_bail() */





#define	RETRY	3

/*
 * open_demux() - open a TCP connection to addr:port.
 *
 * addr is stored directly into sin_addr.s_addr (so presumably it is
 * already in network byte order - confirm against callers); port is in
 * host byte order.
 *
 * Up to RETRY attempts are made.  Transient failures (EINTR, ETIMEDOUT,
 * ECONNRESET, EADDRINUSE, ECONNREFUSED) are retried after a two second
 * pause; any other error ends the attempts immediately.  A new socket is
 * created for every attempt because the state of a socket is unspecified
 * once connect() has failed on it.
 *
 * Returns the connected socket, or -1 on failure with the problem logged
 * and errno set to the error of the last failing call.
 */

int open_demux(

  u_long addr, /* I */
  int    port) /* I */

  {
  static char id[] = "open_demux";
  int         sock;
  int         i;
  int         conn_errno = 0;
  struct sockaddr_in remote;

  /* clear the whole structure, including any padding members */

  memset(&remote,0,sizeof(remote));

  remote.sin_family      = AF_INET;
  remote.sin_addr.s_addr = addr;
  remote.sin_port        = htons((unsigned short)port);

  for (i = 0;i < RETRY;i++) 
    {
    if ((sock = socket(AF_INET,SOCK_STREAM,0)) == -1) 
      {
      conn_errno = errno;

      sprintf(log_buffer,"%s: socket %s",
        id,
        netaddr(&remote));

      log_err(conn_errno,id,log_buffer);

      errno = conn_errno;

      return(-1);
      }

    if (connect(sock,(struct sockaddr *)&remote,sizeof(remote)) == 0)
      {
      /* success */

      return(sock);
      }

    /* remember the reason before anything else can overwrite errno */

    conn_errno = errno;

    close(sock);

    switch (conn_errno) 
      {
      case EINTR:
      case ETIMEDOUT:
      case ECONNRESET:

        sleep(2);

        continue;

      case EADDRINUSE:
      case ECONNREFUSED:

        sprintf(log_buffer,"%s: cannot connect to %s", 
          id, 
          netaddr(&remote));

        log_err(conn_errno,id,log_buffer);

        sleep(2);

        continue;

      default:

        /* not worth retrying */

        break;
      }  /* END switch (conn_errno) */

    break;
    }  /* END for (i) */

  sprintf(log_buffer,"%s: connect %s", 
    id, 
    netaddr(&remote));

  log_err(conn_errno,id,log_buffer);

  errno = conn_errno;

  return(-1);
  }  /* END open_demux() */




/*
 * open_pty - open the slave side of the job's master/slave pty pair.
 *
 * The slave device name is taken from the job's output path attribute.
 * The descriptor is moved above 2, its mode set to 0620 and its ownership
 * given to the job's execution uid/gid; where SETCONTROLLINGTTY is defined
 * it is also made the controlling terminal.
 *
 * Returns the slave descriptor, or the negative value from open() on
 * failure (which is logged).
 */

static int open_pty(

  job *pjob)  /* I */

  {
  char	*slave_name = pjob->ji_wattr[(int)JOB_ATR_outpath].at_val.at_str;
  int	 slave_fd;

  /* Open the slave pty as the controlling tty */

  slave_fd = open(slave_name,O_RDWR,0600);

  if (slave_fd < 0)
    {
    log_err(errno,"open_pty","cannot open slave");

    return(slave_fd);
    }

  FDMOVE(slave_fd);

  fchmod(slave_fd,0620);

  fchown(
    slave_fd,
    pjob->ji_qs.ji_un.ji_momt.ji_exuid,
    pjob->ji_qs.ji_un.ji_momt.ji_exgid);

#ifdef SETCONTROLLINGTTY

#if defined(_CRAY) 
  ioctl(0,TCCLRCTTY,0);
  ioctl(slave_fd,TCSETCTTY,0); /* make controlling */
#elif defined(TCSETCTTY)
  ioctl(slave_fd,TCSETCTTY,0); /* make controlling */
#elif defined(TIOCSCTTY)
  ioctl(slave_fd,TIOCSCTTY,0);
#endif 

#endif	/* SETCONTROLLINGTTY */

  return(slave_fd);
  }  /* END open_pty() */





/*
 * is_joined - determine if standard out and standard error are joined
 *	together (-j option) and if so which is first
 *
 *	The join attribute counts as a join only when it names both streams:
 *	a leading 'o' followed somewhere by 'e', or a leading 'e' followed
 *	somewhere by 'o'.  Anything else (unset, "n", a single letter, ...)
 *	means the streams are kept separate.
 *
 *	Returns: 0 - no join, separate files
 *		+1 - joined as stdout
 *		-1 - joined as stderr
 */

int is_joined(

  job *pjob)  /* I */

  {
  attribute *pattr;
  char      *join;

  pattr = &pjob->ji_wattr[(int)JOB_ATR_join];

  if ((pattr->at_flags & ATR_VFLAG_SET) == 0)
    {
    return(0);
    }

  join = pattr->at_val.at_str;

  if (join == NULL)
    {
    return(0);
    }

  if ((join[0] == 'o') && (strchr(join,(int)'e') != NULL))
    {
    return(1);
    }

  /* the second stream to look for here is 'o', not 'e' */

  if ((join[0] == 'e') && (strchr(join,(int)'o') != NULL))
    {
    return(-1);
    }

  return(0);
  }  /* END is_joined() */




/* 
 * open_std_out_err - open the job's standard output and standard error
 *	files and install them as descriptors 1 and 2 of the calling process.
 *
 *	When the streams are joined (see is_joined()) only the file of the
 *	leading stream is opened and the other stream is a duplicate of it.
 *
 *	Returns 0 on success.  On failure the problem is logged, every
 *	descriptor opened here is closed again, and -1 is returned.
 */

static int open_std_out_err(

  job *pjob)

  {
  static char id[] = "open_std_out_err";

  int joined;
  int saved_errno;
  int file_out = -1;
  int file_err = -1;
  int filemode = O_CREAT | O_WRONLY | O_APPEND | O_EXCL;

  /* if std out/err joined (set and != "n"), which file is first */

  joined = is_joined(pjob);

  if (joined == 1) 
    {
    file_out = open_std_file(
      pjob,
      StdOut,
      filemode,
      pjob->ji_qs.ji_un.ji_momt.ji_exgid);

    /* only duplicate a valid descriptor so the open error stays in errno */

    if (file_out >= 0)
      file_err = dup(file_out);
    } 
  else if (joined == -1) 
    {
    file_err = open_std_file(
      pjob, 
      StdErr, 
      filemode,
      pjob->ji_qs.ji_un.ji_momt.ji_exgid);

    if (file_err >= 0)
      file_out = dup(file_err);
    }
  else
    {
    file_out = open_std_file(
      pjob, 
      StdOut, 
      filemode,
      pjob->ji_qs.ji_un.ji_momt.ji_exgid);

    file_err = open_std_file(
      pjob, 
      StdErr, 
      filemode,
      pjob->ji_qs.ji_un.ji_momt.ji_exgid);
    }

  if ((file_out < 0) || (file_err < 0)) 
    {
    log_err(errno,id,
      "Unable to open standard output/error");

    goto fail;
    }

  FDMOVE(file_out);	/* make sure descriptor > 2       */
  FDMOVE(file_err);	/* so don't clobber stdin/out/err */

  /* FDMOVE leaves a negative descriptor behind if the duplication failed */

  if ((file_out < 0) || (file_err < 0))
    {
    log_err(errno,id,
      "Unable to move standard output/error above descriptor 2");

    goto fail;
    }

  /*
   * dup2() names the target descriptor explicitly; close()+dup() would
   * land on descriptor 0 instead if stdin happened to be closed.
   */

  if ((dup2(file_out,1) == -1) || (dup2(file_err,2) == -1))
    {
    saved_errno = errno;

    log_err(saved_errno,id,
      "Unable to redirect standard output/error");

    goto fail;
    }

  close(file_out);
  close(file_err);

  return(0);

fail:

  if (file_out >= 0)
    close(file_out);

  if (file_err >= 0)
    close(file_err);

  return(-1);
  }  /* END open_std_out_err() */


/*
 * mkdirtree() - create directory 'dirpath' together with any missing
 *	parent directories (like "mkdir -p").
 *
 * dirpath must be an absolute path naming at least one component.  Every
 * directory is created with exactly the permission bits in 'mode': the
 * process umask is cleared for the duration of the call and restored
 * before returning.  Components that already exist are not an error.
 * Repeated and trailing slashes are tolerated.
 *
 * Returns 0 on success, -1 if dirpath is unusable (NULL, relative, or made
 * up only of slashes) or memory cannot be allocated, otherwise the errno
 * value of the mkdir() call that failed.
 */

int mkdirtree(

  char *dirpath, /* I */
  mode_t mode)   /* I */

  {
  char   *path;
  char   *p;
  char    saved;
  size_t  len;
  mode_t  oldmask;
  int     rc = 0;

  if ((dirpath == NULL) || (*dirpath != '/'))
    {
    return(-1);
    }

  /* a path consisting solely of slashes names nothing to create */

  for (p = dirpath;*p == '/';p++)
    {
    /* NO-OP */
    }

  if (*p == '\0')
    {
    return(-1);
    }

  /* work on a private copy so terminators can be written into it */

  len = strlen(dirpath);

  if ((path = malloc(len + 1)) == NULL)
    {
    return(-1);
    }

  memcpy(path,dirpath,len + 1);

  oldmask = umask(0000);

  /*
   * Walk the copy; at the end of every component (a '/' or the final
   * terminator) cut the string there, create that prefix, and put the
   * original character back.
   */

  for (p = path + 1;rc == 0;p++)
    {
    if ((*p != '/') && (*p != '\0'))
      continue;

    if (*(p - 1) != '/')  /* skip empty components ("//", trailing '/') */
      {
      saved = *p;

      *p = '\0';

      if ((mkdir(path,mode) == -1) && (errno != EEXIST))
        rc = errno;

      *p = saved;
      }

    if (*p == '\0')
      break;
    }

  umask(oldmask);

  free(path);

  return(rc);
  }  /* END mkdirtree() */

  

/*
 * TTmpDirName() - if the configuration allows it, construct the path of
 *	the job's transient directory.
 *
 * A per-job directory is only used when tmpdir_basename is an absolute
 * path; the name is then "<tmpdir_basename>/<jobid>", written into tmpdir
 * (at most MAXPATHLEN bytes).  Otherwise tmpdir is set to the empty string.
 *
 * Returns non-zero if a directory name was produced, 0 otherwise.
 */

int TTmpDirName(

  job  *pjob,   /* I */
  char *tmpdir) /* O */

  {
  if (tmpdir_basename[0] != '/')
    {
    /* no usable base directory configured */

    *tmpdir = '\0';

    return(0);
    }

  snprintf(tmpdir,MAXPATHLEN,"%s/%s",tmpdir_basename,pjob->ji_qs.ji_jobid);

  return(*tmpdir != '\0');  /* return "true" if tmpdir is set */
  }  /* END TTmpDirName() */


/*
 * TMakeTmpDir() - create the job's transient directory 'tmpdir' while
 *	running with the job's effective uid/gid.
 *
 * If mkdirtree() succeeds the job is flagged MOM_HAS_TMPDIR.  If it fails
 * but the path already exists as a directory owned by the job's user, that
 * is accepted as well (without setting the flag).
 *
 * The effective uid/gid are switched back (to 0 / pbsgroup) on every path
 * once they have been changed.
 *
 * Returns 1 if the directory is usable, 0 otherwise (failures that occur
 * after the identity switch are logged).
 */

int TMakeTmpDir(

  job  *pjob,   /* I (modified) */
  char *tmpdir) /* I */

  {
  char id[]="TMakeTmpDir";
  int			rc;
  int			retval;
  struct stat		sb;

  /* become the job's user so the directory ends up owned by that user */

#if defined(HAVE_SETEUID) && defined(HAVE_SETEGID)

  if (setegid(pjob->ji_qs.ji_un.ji_momt.ji_exgid) == -1)
    {
    return(0);
    }

  if (seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) == -1)
    {
    /* undo the group switch made just above */

    setegid(pbsgroup);

    return(0);
    }

#elif defined(HAVE_SETRESUID) && defined(HAVE_SETRESGID)

  if (setresgid(-1,pjob->ji_qs.ji_un.ji_momt.ji_exgid,-1) == -1)
    {
    return(0);
    }

  if (setresuid(-1,pjob->ji_qs.ji_un.ji_momt.ji_exuid,-1) == -1)
    {
    /* undo the group switch made just above */

    setresgid(-1,pbsgroup,-1);

    return(0);
    }

#else

  /* no way to assume the user's identity in this configuration; as
   * before, the directory is not created and failure is reported */

  return(0);

#endif

  retval=mkdirtree(tmpdir,0755);

  if (retval == 0)
    {
    /* We made it, it's ours */
    pjob->ji_flags |= MOM_HAS_TMPDIR;
    }
  else
    {
    /* mkdirtree failed; find out what, if anything, is at that path */

    rc = (stat(tmpdir,&sb) == 0) ? 0 : errno;

    switch (rc)
      {
      case ENOENT:

        sprintf(log_buffer,
          "Unable to make job transient directory: %s",
          tmpdir);

        break;

      case 0:

        if (S_ISDIR(sb.st_mode))
          {
          if (sb.st_uid == pjob->ji_qs.ji_un.ji_momt.ji_exuid)
            {
            retval=0;  /* owned by the job, allowed */
            }
          else
            {
            sprintf(log_buffer,
              "Job transient tmpdir %s already exists, owned by %d",
              tmpdir,
              (int)sb.st_uid);

            retval=-1;
            }
          }
        else
          {
          sprintf(log_buffer,
            "Job transient tmpdir %s exists, but is not a directory",
            tmpdir);

          retval=-1;
          }

        break;

      default:

        sprintf(log_buffer,
          "Cannot name job tmp directory %s (on stat)",
          tmpdir);

        /* do not return from here: the effective ids must be restored */

        retval=rc;

        break;
      }  /* END switch (rc) */
    }

  /* switch back to the daemon's own identity */

#if defined(HAVE_SETEUID) && defined(HAVE_SETEGID)
  seteuid(0);
  setegid(pbsgroup);
#elif defined(HAVE_SETRESUID) && defined(HAVE_SETRESGID)
  setresuid(-1,0,-1);
  setresgid(-1,pbsgroup,-1);
#endif  /* HAVE_SETRESUID */

  if (retval != 0)
    log_err(retval,id,log_buffer);

  return(retval == 0);  /* return boolean */
  }  /* END TMakeTmpDir() */



/*
 * InitUserEnv() - set up the environment for a user process in the global
 *	'vtable'; used by TMomFinalizeJob1, start_process, and file copies.
 *
 * The table is filled, in this order, from:
 *   1. the first num_var_env entries of mom's own environment,
 *   2. the variables passed with the job,
 *   3. the values computed here for the names in variables_else[],
 *   4. the caller supplied envp (if any).
 *
 * TMPDIR is only computed here when the job did not pass a TMPDIR variable
 * of its own and TTmpDirName() yields a directory.
 *
 * Returns 0 on success, -1 on failure (NULL job or out of memory; the
 * problem is logged).
 */

int InitUserEnv(

  job            *pjob,   /* I */
  task           *ptask,  /* I (optional) */
  char          **envp,   /* I (optional) */
  struct passwd  *pwdp,   /* I (optional) */
  char           *shell)  /* I (optional) */

  {
  char id[]="InitUserEnv";

  struct array_strings *vstrs;
  int    j=0;
  int    ebsize=0;
  int    saved_errno;
  char   buf[MAXPATHLEN + 2];
  int    usertmpdir=0;
  size_t tmpdir_len;
  char  *var;

  if (pjob == NULL)
    {
    sprintf(log_buffer,"passed a NULL pjob!");

    log_err(errno,id,log_buffer);

    return(-1);
    }

  /* initialize vtable */

  if (envp != NULL)
    {
    /* count each string together with its terminator */

    for (j = 0,ebsize = 0;envp[j]; j++)
      ebsize += strlen(envp[j]) + 1;
    }

  /* from here on j holds the number of entries in envp (0 if none) */

  vstrs = pjob->ji_wattr[(int)JOB_ATR_variables].at_val.at_arst;

  vtable.v_bsize = ebsize + EXTRA_VARIABLE_SPACE +
                     (vstrs != NULL ? (vstrs->as_next - vstrs->as_buf) : 0);

  vtable.v_block = malloc(vtable.v_bsize);

  if (vtable.v_block == NULL)
    {
    saved_errno = errno;

    sprintf(log_buffer,"PBS: failed to init env, malloc: %s\n",
      strerror(saved_errno));

    log_err(saved_errno,id,log_buffer);

    return(-1);
    }

  vtable.v_ensize = num_var_else + num_var_env + j + EXTRA_ENV_PTRS +
                      (vstrs != NULL ? vstrs->as_usedptr : 0);

  vtable.v_used = 0;

  vtable.v_envp = malloc(vtable.v_ensize * sizeof(char *));

  if (vtable.v_envp == NULL)
    {
    saved_errno = errno;

    sprintf(log_buffer,"PBS: failed to init env, malloc: %s\n",
      strerror(saved_errno));

    log_err(saved_errno,id,log_buffer);

    /* do not leak the string block allocated above */

    free(vtable.v_block);

    vtable.v_block = NULL;

    return(-1);
    }

  /* First variables from the local environment */

  for (j = 0;j < num_var_env;++j)
    bld_env_variables(&vtable,environ[j],NULL);

  /* Next, the variables passed with the job.  They may   */
  /* be overwritten with new correct values for this job  */

  if (vstrs != NULL)
    {
    tmpdir_len = strlen(variables_else[12]);

    for (j = 0;j < vstrs->as_usedptr;++j)
      {
      var = vstrs->as_string[j];

      bld_env_variables(&vtable,var,NULL);

      /* the job supplies its own TMPDIR only if the name matches     */
      /* exactly; a longer name that merely starts with it is not it  */

      if ((strncmp(var,variables_else[12],tmpdir_len) == 0) &&
          ((var[tmpdir_len] == '=') || (var[tmpdir_len] == '\0')))
        {
        usertmpdir = 1;
        }
      }
    }

  /* HOME */

  if (pjob->ji_grpcache != NULL)
    bld_env_variables(&vtable,variables_else[0],pjob->ji_grpcache->gc_homedir);
  
  /* LOGNAME */

  if (pwdp != NULL)
    bld_env_variables(&vtable,variables_else[1],pwdp->pw_name);

  /* PBS_JOBNAME */

  bld_env_variables(
    &vtable,
    variables_else[2],
    pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str);
  
  /* PBS_JOBID */

  bld_env_variables(&vtable,variables_else[3],pjob->ji_qs.ji_jobid);
  
  /* PBS_QUEUE */

  bld_env_variables(
    &vtable,
    variables_else[4],
    pjob->ji_wattr[(int)JOB_ATR_in_queue].at_val.at_str);
  
  /* SHELL */

  if (shell != NULL)
    bld_env_variables(&vtable,variables_else[5],shell);

  /* USER, for compatibility */

  if (pwdp != NULL)
    bld_env_variables(&vtable,variables_else[6],pwdp->pw_name);

  /* PBS_JOBCOOKIE */

  bld_env_variables(
    &vtable,
    variables_else[7],
    pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str);

  /* PBS_NODENUM */

  sprintf(buf,"%d",
    pjob->ji_nodeid);

  bld_env_variables(&vtable,variables_else[8],buf);

  /* PBS_TASKNUM */

  if (ptask != NULL)
    {
    sprintf(buf,"%d",
      (int)ptask->ti_qs.ti_task);

    bld_env_variables(&vtable,variables_else[9],buf);
    }

  /* PBS_MOMPORT (pbs_rm_port is unsigned) */

  sprintf(buf,"%u",
    pbs_rm_port);

  bld_env_variables(&vtable,variables_else[10],buf);

  /* PBS_NODEFILE */

  if (pjob->ji_flags & MOM_HAS_NODEFILE)
    {
    /* bounded: path_aux plus the job id must not overrun buf */

    snprintf(buf,sizeof(buf),"%s/%s",
      path_aux,
      pjob->ji_qs.ji_jobid);

    bld_env_variables(&vtable,variables_else[11],buf);
    }

  /* setup TMPDIR */

  if (!usertmpdir && TTmpDirName(pjob,buf))
    bld_env_variables(&vtable,variables_else[12],buf);

  /* passed-in environment for tasks */

  if (envp != NULL)
    {
    for (j = 0;envp[j];j++)
      bld_env_variables(&vtable,envp[j],NULL);
    }

  return(0);
  }  /* END InitUserEnv() */





/*
 * Used by MOM superior to start the shell process.
 * perform all server level pre-job tasks, collect information
 * create parent-child pipes 
 */

int TMomFinalizeJob1(

  job        *pjob,  /* I (modified) */
  pjobexec_t *TJE,   /* O */
  int        *SC)    /* O */

  {
  static char 	       *id = "TMomFinalizeJob1";

  torque_socklen_t	         slen;

  int                    i;

  attribute		*pattr;
  attribute		*pattri;
  resource		*presc;
  resource_def		*prd;
  struct sockaddr_in     saddr;

#if MOM_CHECKPOINT == 1
  char	   		buf[MAXPATHLEN + 2];
  struct stat		sb;
#endif /* MOM_CHECKPOINT */

#if defined(PENABLE_CPUSETS) || defined(PENABLE_DYNAMIC_CPUSETS)
  char                  cQueue[16];
#endif  /* (PENABLE_CPUSETS || PENABLE_DYNAMIC_CPUSETS) */

  *SC = 0;

  if (TJE == NULL)
    {
    sprintf(log_buffer,"bad param in %s",
      id);

    *SC = JOB_EXEC_RETRY;
    
    return(FAILURE);
    }

  /* initialize job exec struct */

  memset(TJE,0,sizeof(pjobexec_t));

  TJE->ptc = -1;

  TJE->pjob = (void *)pjob;

  /* prepare job environment */

  if (pjob->ji_numnodes > 1) 
    {
    /*
    ** Get port numbers from file decriptors in job struct.  The
    ** sockets are stored there so they can be closed later as
    ** Main MOM will not need them after the job is going.
    */

    slen = sizeof(saddr);

    if (getsockname(
          pjob->ji_stdout,
          (struct sockaddr *)&saddr,
          &slen) == -1) 
      {
      sprintf(log_buffer,"getsockname on stdout");
 
      *SC = JOB_EXEC_RETRY;
     
      return(FAILURE);
      }

    TJE->port_out = (int)ntohs(saddr.sin_port);
	
    slen = sizeof(saddr);

    if (getsockname(
         pjob->ji_stderr,
         (struct sockaddr *)&saddr,
         &slen) == -1) 
      {
      sprintf(log_buffer,"getsockname on stderr");
 
      *SC = JOB_EXEC_RETRY;

      return(FAILURE);
      }

    TJE->port_err = (int)ntohs(saddr.sin_port);
    } 
  else 
    {
    TJE->port_out = -1;
    TJE->port_err = -1;
    }

  /* did the job request nodes?  will need to setup node file */

  pattr = &pjob->ji_wattr[(int)JOB_ATR_resource];

  prd = find_resc_def(svr_resc_def,"neednodes",svr_resc_size);

  presc = find_resc_entry(pattr,prd);

  if (presc != NULL) 
    pjob->ji_flags |= MOM_HAS_NODEFILE;

  /*
   * get the password entry for the user under which the job is to be run
   * we do this now to save a few things in the job structure
   */

  if ((TJE->pwdp = (void *)check_pwd(pjob)) == NULL) 
    {
    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
 
    *SC = JOB_EXEC_FAIL1;

    return(FAILURE);
    }

#if IBM_SP2==2        /* IBM SP with PSSP 3.1 */

  /* load IBM SP switch table */

  if (load_sp_switch(pjob) != 0) 
    {
    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    *SC = JOB_EXEC_RETRY;

    return(FAILURE);
    }

#endif	/* IBM SP */

  /*
   * if certain resource limits require that the job usage be
   * polled or it is a multinode job, we link the job to mom_polljobs.
   *
   * NOTE: we overload the job field ji_jobque for this as it
   * is not used otherwise by MOM
   */

  if ((pjob->ji_numnodes > 1) || (mom_do_poll(pjob) != 0))
    append_link(&mom_polljobs,&pjob->ji_jobque,pjob);

#if MOM_CHECKPOINT == 1

  /* Is the job to be periodically checkpointed */

  pattr = &pjob->ji_wattr[(int)JOB_ATR_chkpnt];

  if ((pattr->at_flags & ATR_VFLAG_SET) &&
      (*pattr->at_val.at_str == 'c') &&
      (*(pattr->at_val.at_str + 1) == '=')) 
    {
    /* has checkpoint time (in minutes), convert to milliseconds */

    pjob->ji_chkpttime = atoi(pattr->at_val.at_str + 2) * 60;
    pjob->ji_chkptnext = pjob->ji_chkpttime;
    }

  /* If job has been checkpointed, restart from the checkpoint image */

  strcpy(buf,path_checkpoint);
  strcat(buf,pjob->ji_qs.ji_fileprefix);
  strcat(buf,JOB_CKPT_SUFFIX);

  if (((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHKPT) || 
       (pjob->ji_qs.ji_svrflags & JOB_SVFLG_ChkptMig)) &&
       (stat(buf,&sb) == 0)) 
    {
    /* Checkpointed - restart from checkpoint file */

    /* perform any site required setup before restart */

    if ((i = site_mom_prerst(pjob)) != 0) 
      {
      pjob->ji_qs.ji_un.ji_momt.ji_exitstat = i;

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;

      exiting_tasks = 1;

      sprintf(log_buffer,"Pre-restart failed %d",
        errno);

      LOG_EVENT(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      /* FIXME: do we need to return failure at this point? */
      }

    if ((i = mom_restart_job(pjob,buf)) > 0) 
      {
      sprintf(log_buffer,"Restarted %d tasks",
        i);

      LOG_EVENT(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      /* reset mtime so walltime will not include held time */
      /* update to time now minus the time already used	  */
      /* unless it is suspended, see request.c/req_signal() */

      time_now = time(0);

      if ((pjob->ji_qs.ji_svrflags & JOB_SVFLG_Suspend) == 0) 
        {
        pjob->ji_qs.ji_stime = 
          time_now - (sb.st_mtime - pjob->ji_qs.ji_stime);

        pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

        if (mom_get_sample() != PBSE_NONE)
          mom_set_use(pjob);
        } 
      else 
        {
        pjob->ji_qs.ji_substate = JOB_SUBSTATE_SUSPEND;
        }
      } 
    else 
      {
      /* FAILURE */
	
      /* retry for any kind of changable thing */

      if ((errno == EAGAIN) ||

#ifdef	ERFLOCK
          (errno == ERFLOCK) ||
#endif
#ifdef	EQUSR
          (errno == EQUSR) ||
#endif
#ifdef	EQGRP
          (errno == EQGRP) ||
#endif
#ifdef	EQACT
          (errno == EQACT) ||
#endif
#ifdef	ENOSDS
          (errno == ENOSDS) ||
#endif
          (errno == ENOMEM) ||
          (errno == ENOLCK) ||
          (errno == ENOSPC) ||
          (errno == ENFILE) ||
          (errno == EDEADLK) ||
          (errno == EBUSY))
        {
        pjob->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_RETRY;
        }
      else 
        {
        pjob->ji_qs.ji_un.ji_momt.ji_exitstat = JOB_EXEC_BADRESRT;
        }

      pjob->ji_qs.ji_substate = JOB_SUBSTATE_EXITING;

      exiting_tasks = 1;

      sprintf(log_buffer,"Restart failed, error %d",
        errno);

      LOG_EVENT(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }  /* END else (mom_restart_job() == SUCCESS) */

    /* NOTE:  successful checkpoint handling routes through here */

    *SC = 0;

    return(FAILURE);
    }  /* END (((pjob->ji_qs.ji_svrflags & JOB_SVFLG_CHKPT) || ...) */

#endif	/* MOM_CHECKPOINT */

  pattri = &pjob->ji_wattr[(int)JOB_ATR_interactive];

  if ((pattri->at_flags & ATR_VFLAG_SET) &&
      (pattri->at_val.at_long != 0)) 
    {
    TJE->is_interactive = TRUE;
    }
  else
    {
    TJE->is_interactive = FALSE;
    }

  if (TJE->is_interactive == TRUE)
    {
    /*
     * open a master pty, need to do it here before we fork,
     * to save the slave name in the master's job structure
     */

    if ((TJE->ptc = open_master(&TJE->ptc_name)) < 0) 
      {
      log_err(errno,id,"cannot open master pty");

      *SC = JOB_EXEC_RETRY;

      return(FAILURE);
      }

    FDMOVE(TJE->ptc)

    /* save pty name in job output/error file name */

    pattr = &pjob->ji_wattr[(int)JOB_ATR_outpath];

    job_attr_def[(int)JOB_ATR_outpath].at_free(pattr);

    job_attr_def[(int)JOB_ATR_outpath].at_decode(
      pattr, 
      NULL, 
      NULL, 
      TJE->ptc_name);

    pjob->ji_wattr[(int)JOB_ATR_outpath].at_flags =
      (ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_SEND);

    pattr = &pjob->ji_wattr[(int)JOB_ATR_errpath];

    job_attr_def[(int)JOB_ATR_errpath].at_free(pattr);

    job_attr_def[(int)JOB_ATR_errpath].at_decode(
      pattr, 
      NULL, 
      NULL, 
      TJE->ptc_name);

    pjob->ji_wattr[(int)JOB_ATR_errpath].at_flags =
      (ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_SEND);

    }  /* END if (TJE->is_interactive == TRUE) */

#if SHELL_USE_ARGV == 0
#if SHELL_INVOKE == 1

  if (TJE->is_interactive == FALSE)
    {
    /* need a pipe on which to write the shell script   */
    /* file name to the input of the shell                      */

    if (pipe(TJE->pipe_script) == -1)
      {
      sprintf(log_buffer,
        "Failed to create shell name pipe");

      LOG_EVENT(
        PBSEVENT_JOB,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      *SC = JOB_EXEC_RETRY;

      return(FAILURE);
      }
    }    /* END if (TJE->is_interactive == FALSE) */

#endif /* SHELL_INVOKE */
#endif /* !SHELL_USE_ARGV */

  /* create pipes between MOM and the job starter   */
  /* fork the job starter which will become the job */

  if ((pipe(TJE->mjspipe) == -1) || (pipe(TJE->jsmpipe) == -1))
    {
    i = -1;
    }
  else
    {
    i = 0;

    /* make sure pipe file descriptors are above 2 */

    if (TJE->jsmpipe[1] < 3)
      {
      TJE->upfds = fcntl(TJE->jsmpipe[1],F_DUPFD,3);

      close(TJE->jsmpipe[1]);

      TJE->jsmpipe[1] = 0;
      }
    else
      {
      TJE->upfds = TJE->jsmpipe[1];
      }

    if (TJE->mjspipe[0] < 3)
      {
      TJE->downfds = fcntl(TJE->mjspipe[0],F_DUPFD,3);

      close(TJE->mjspipe[0]);

      TJE->mjspipe[0] = 0;
      }
    else
      {
      TJE->downfds = TJE->mjspipe[0];
      }
    }

  if ((i == -1) || (TJE->upfds < 3) || (TJE->downfds < 3))
    {
    sprintf(log_buffer,"cannot create communication pipe");

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    *SC = JOB_EXEC_RETRY;

    return(FAILURE);
    }

  if ((TJE->ptask = (void *)pbs_task_create(pjob,TM_NULL_TASK)) == NULL)
    {
    sprintf(log_buffer,"cannot create job task");

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);

    *SC = JOB_EXEC_RETRY;

    return(FAILURE);
    }

  pjob->ji_qs.ji_substate = JOB_SUBSTATE_STARTING;

  pjob->ji_qs.ji_stime = time_now;

  return(SUCCESS);
  }  /* END TMomFinalizeJob1() */





/*
 * TMomFinalizeJob2() - second phase of job launch: fork the job starter.
 *
 * The forked child runs TMomFinalizeChild() and does not come back.  The
 * parent closes its copies of the starter's pipe ends (upfds/downfds) and
 * the master pty if one is open, chowns the job script to the execution
 * uid/gid and, when the shell is built to read the script name on stdin
 * (SHELL_USE_ARGV == 0 and SHELL_INVOKE == 1), writes the script path
 * followed by a newline to the script pipe.
 *
 * TJE  (I)  job launch context (pjob, pipes, pty, interactive flag)
 * SC   (O)  JOB_EXEC_RETRY if the fork failed, 0 on success
 *
 * Returns SUCCESS in the parent, FAILURE if the fork failed.
 */

int TMomFinalizeJob2(

  pjobexec_t *TJE,   /* I */
  int        *SC)    /* O */

  {
  static char          *id = "TMomFinalizeJob2";

  char                  buf[MAXPATHLEN + 2];
  pid_t                 cpid;
  int                   i, j;

  job                  *pjob;

  pjob = (job *)TJE->pjob;

  /*
  ** fork the child that will become the job.
  */

  if ((cpid = fork_me(-1)) < 0)
    {
    /* fork failed */

    sprintf(log_buffer,"fork of job '%s' failed in (errno=%d, '%s')",
      pjob->ji_qs.ji_jobid,
      errno,
      strerror(errno));

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      id,
      log_buffer);

    *SC = JOB_EXEC_RETRY;

    return(FAILURE);
    }

  if (cpid == 0) 
    {
    /* CHILD:  handle child activities */

    TMomFinalizeChild(TJE);

    /*
     * TMomFinalizeChild() is expected to exec or exit.  Should it ever
     * return, terminate here so the child cannot fall through into the
     * parent-only code below.
     */

    exit(254);
    }

  /* parent */

  /* these descriptors belong to the child now */

  close(TJE->upfds);
  close(TJE->downfds);

  if (TJE->ptc >= 0)
    close(TJE->ptc);

  /* build the job script path; it is reused below for the script pipe */

  strcpy(buf,path_jobs);
  strcat(buf,pjob->ji_qs.ji_fileprefix);
  strcat(buf,JOB_SCRIPT_SUFFIX);

  /*
   * Give the script to the execution user.  This stays best-effort, but a
   * failure is now reported.  ENOENT is not reported: the child also
   * tolerates a missing script file (it falls back to /dev/null).
   */

  if ((chown(
         buf, 
         pjob->ji_qs.ji_un.ji_momt.ji_exuid,
         pjob->ji_qs.ji_un.ji_momt.ji_exgid) == -1) &&
      (errno != ENOENT))
    {
    sprintf(log_buffer,"cannot chown job script '%s' (errno=%d, '%s')",
      buf,
      errno,
      strerror(errno));

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      id,
      log_buffer);
    }

#if SHELL_USE_ARGV == 0
#if SHELL_INVOKE == 1

  if (TJE->is_interactive == FALSE) 
    {
    int k;

    /* pass name of shell script on pipe */
    /* will be stdin of shell 	*/
	
    close(TJE->pipe_script[0]);
    strcat(buf,"\n");	      /* setup above */

    i = strlen(buf);
    j = 0;

    /* write() may be short or interrupted; loop until all bytes are sent */

    while (j < i) 
      {
      if ((k = write(TJE->pipe_script[1],buf + j,i - j)) < 0) 
        {
        if (errno == EINTR)
          continue;

        /* hard failure: report it instead of dropping it silently */

        sprintf(log_buffer,"cannot write script name to shell pipe (errno=%d, '%s')",
          errno,
          strerror(errno));

        log_record(
          PBSEVENT_ERROR,
          PBS_EVENTCLASS_JOB,
          id,
          log_buffer);

        break;
        }

      j += k;
      }	

    close(TJE->pipe_script[1]);
    }

#endif	/* SHELL_INVOKE */
#endif  /* !SHELL_USE_ARGV */

  /* SUCCESS:  parent returns */

  if (LOGLEVEL >= 3)
    {
    sprintf(log_buffer,"phase 2 of job launch successfully completed");

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  *SC = 0;

  return(SUCCESS);
  }  /* END TMomFinalizeJob2() */





/*
 * TMomFinalizeChild() - child half of job launch, called by
 * TMomFinalizeJob2() in the freshly forked process that becomes the job.
 *
 * Steps, in order:
 *   - close lockfds and the pipe ends the child does not use
 *   - choose the shell (set_shell) and build the environment (InitUserEnv)
 *   - write the job's node file when MOM_HAS_NODEFILE is set
 *   - optionally create a dynamic cpuset and/or a Cray CPA partition
 *   - interactive job: connect to qsub, open the slave pty, fork the
 *     "writer" and "reader" helper processes, run the prologs
 *   - batch job: wire up stdin/stdout/stderr, run the prologs, set the
 *     job session
 *   - run site setup, apply resource limits, chroot to PBS_O_ROOTDIR if set
 *   - switch to the job's groups/gid/uid, chdir to PBS_O_INITDIR or the
 *     user's home directory, set up X11 forwarding if requested
 *   - report JOB_EXEC_OK to MOM through starter_return() and exec the
 *     shell (for multi-node jobs a further child execs the demux program)
 *
 * The process keeps the credentials inherited from MOM until the
 * setgroups/setgid/setuid step; only the code after that point runs as
 * the job's user.
 *
 * TJE  (I)  job launch context (pjob, ptask, pwdp, pipes, pty, sjr)
 *
 * Does not return in normal operation: the process either execs or exits
 * (exit code 254 when the exec fails).  The return(-1) statements follow
 * starter_return() calls which, per the existing comments, exit when given
 * a failure code.
 */

int TMomFinalizeChild(

  pjobexec_t *TJE)   /* I */

  {
  static char          *id = "TMomFinalizeChild";

  char                 *arg[3];
  char                  buf[MAXPATHLEN + 2];
  pid_t                 cpid;
  int                   i, j, vnodenum;
  char                 *phost = NULL;
  int                   pport = 0;
  int                   pts;
  int                   qsub_sock;
  char                  *shell;
  char                  *shellname;
  char                  *idir;
  char                  *termtype;

  struct startjob_rtn   sjr = {0,0};

#if defined(PENABLE_DYNAMIC_CPUSETS)

  attribute            *pattr;
  char                  cQueueName[16];  /* Unique CpuSet Name */
  char                  cPermFile[1024]; /* Unique File Name */
  FILE                  *fp;            /* file pointer into /proc/cpuinfo */
  char                  cBuffer[CBUFFERSIZE + 1];  /* char buffer used for counting procs */
  int                   nCPUS = 0;              /* Number of cpus the machine has */
  int                   nCpuId = 0;             /* CpuId */

  struct CpuSetMap {
    short CpuId;
    char  cQueueName[16]; /* Data struct for mapping CpuId to
                             CpuSets assignments for the machine */
  } *cpusetMap = NULL;

  /* initialised to NULL so the cleanup code can tell what was obtained */

  cpuset_NameList_t     *cpusetList = NULL;  /* List of all cpusets defined on machine */
  cpuset_CPUList_t      *cpuList = NULL;     /* List of Cpus assigned to one CpuSet */
  cpuset_QueueDef_t     *cpuQdef = NULL;     /* CpuSet Definition */
  resource              *presc;         /* Requested Resource List */
  resource_def          *prd;

#endif  /* PENABLE_DYNAMIC_CPUSETS */

  job                  *pjob;
  task                 *ptask;

  struct passwd        *pwdp;

  pjob  = (job *)TJE->pjob;
  ptask = (task *)TJE->ptask;

  pwdp  = (struct passwd *)TJE->pwdp;

  /*******************************************/
  /*                                         */
  /* The child process - will become the job */
  /*                                         */
  /*******************************************/

  if (lockfds >= 0)
    {
    close(lockfds);
  
    lockfds = -1;
    }

  /* the child only uses upfds/downfds; close the parent's pipe ends */

  close(TJE->jsmpipe[0]);
  close(TJE->mjspipe[1]);

  /*
   * find which shell to use, one specified or the login shell
   */

  shell = set_shell(pjob,pwdp);	/* in the machine dependent section */

  /* Setup user env */

  if (InitUserEnv(pjob,ptask,NULL,pwdp,shell) < 0)
    {
    log_err(-1,id,"failed to setup user env");

    starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_RETRY,&sjr);
    }

  /* Create the job's nodefile */

  vnodenum = pjob->ji_numvnod;

  if (pjob->ji_flags & MOM_HAS_NODEFILE) 
    {
    FILE *nhow;

    char *BPtr;

    sprintf(buf,"%s/%s",
      path_aux, 
      pjob->ji_qs.ji_jobid);

    if ((nhow = fopen(buf,"w")) == NULL) 
      {
      sprintf(log_buffer,"cannot open %s",
        buf);

      log_err(errno,id,log_buffer);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);
      }

    /*
    **	The file must be owned by root and readable by
    **	the user.  We take the easy way out and make
    **	it readable by anyone.
    */

    if (fchmod(fileno(nhow),0644) == -1) 
      {
      sprintf(log_buffer,"cannot chmod %s",
        buf);

      log_err(errno,id,log_buffer);

      fclose(nhow);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);
      }

    /* NOTE:  if BEOWULF_JOB_MAP is set, populate node file with this info */

    BPtr = get_job_envvar(pjob,"BEOWULF_JOB_MAP");

    if (BPtr != NULL)
      {
      char tmpBuffer[1000000];

      char *ptr;

      /* FORMAT:  <HOST>[:<HOST>]... */

      /* strncpy() does not terminate on truncation; strtok() needs a
       * terminated string, so terminate explicitly */

      strncpy(tmpBuffer,BPtr,sizeof(tmpBuffer) - 1);

      tmpBuffer[sizeof(tmpBuffer) - 1] = '\0';

      ptr = strtok(tmpBuffer,":");

      while (ptr != NULL)
        {
        if (nodefile_suffix != NULL)
          {
          fprintf(nhow,"%s%s\n",
            ptr,
            nodefile_suffix);
          }
        else
          {
          fprintf(nhow,"%s\n",
            ptr);
          }

        ptr = strtok(NULL,":");
        }
      }
    else
      {
      /* one line per virtual node assigned to the job */

      for (j = 0;j < vnodenum;j++)
        {
        vnodent *vp = &pjob->ji_vnods[j];

        if (nodefile_suffix != NULL)
          {
          fprintf(nhow,"%s%s\n",
            vp->vn_host->hn_host,
            nodefile_suffix);
          }
        else
          {
          fprintf(nhow,"%s\n",
            vp->vn_host->hn_host);
          }
        }   /* END for (j) */
      }

    fclose(nhow);
    }  /* END if (pjob->ji_flags & MOM_HAS_NODEFILE) */

  if (LOGLEVEL >= 10)
    log_err(-1,id,"node file created");

  /* Set PBS_VNODENUM */

  sprintf(buf,"%d",0);

  bld_env_variables(&vtable,"PBS_VNODENUM",buf);

#if defined(PENABLE_CPUSETS) || defined(PENABLE_DYNAMIC_CPUSETS)

#ifdef PENABLE_DYNAMIC_CPUSETS

  /* create dynamic cpuset(s) */

  /* Get the number of Active CPUs in the system. */

  if ((fp = fopen( "/proc/cpuinfo","r")) == NULL) 
    {
    sprintf(log_buffer,"cannot open /proc/cpuinfo");

    log_err(errno,id,log_buffer);
    }
  else
    {
    /* Look for each instance of "processor" in cpuinfo */

    while (fgets(cBuffer,CBUFFERSIZE,fp)) 
      {
      if (!strncmp(cBuffer,"processor  :",strlen("processor  :")))
        nCPUS++;
      }

    /* Reset the file pointer, so we can parse the cpuinfo
       file again below */

    rewind(fp);
    }

  /* Allocate memory for the CPU-CpuSet map. */

  cpusetMap = (struct CpuSetMap *)malloc(nCPUS * sizeof(struct CpuSetMap));

  if (cpusetMap == NULL) 
    {
    sprintf(log_buffer,"cannot allocate memory for CpuSetMap struct");

    log_err(errno,id,log_buffer);
    }
  else 
    {
    memset(cpusetMap,0,(nCPUS * sizeof(struct CpuSetMap)));

    /* Create map of CPU-CpuSet assignments */

    while ((fp != NULL) && fgets(cBuffer,CBUFFERSIZE,fp)) 
      {
      if (!strncmp(cBuffer,"processor  :",strlen("processor  :")))
        {
        /* ignore ids that did not parse or fall outside the map */

        if ((sscanf(&cBuffer[12],"%d",&nCpuId) == 1) &&
            (nCpuId >= 0) &&
            (nCpuId < nCPUS))
          {
          cpusetMap[nCpuId].CpuId = nCpuId;
          }
        }
      }    /* END while() */

    if (!(cpusetList = cpusetGetNameList())) 
      {
      sprintf(log_buffer,"cannot get CpuSet NameList");

      log_err(errno,id,log_buffer);
      }
    else 
      {
      /* Get the list of CPUs in each CpuSet. */

      for (i = 0;i < cpusetList->count;i++) 
        {
        if (!(cpuList = cpusetGetCPUList(cpusetList->list[i]))) 
          {
          sprintf(log_buffer,"cannot get cpuList");

          log_err(errno,id,log_buffer);

          /* nothing to record for this cpuset */

          continue;
          }

        /* Copy the queue name into each used CPU in the CPU-job map. */

        for (j = 0;j < cpuList->count;j++)
          {
          int cpu = cpuList->list[j];

          /* CpuSet Name = cpusetList->list[i] */

          if ((cpu < 0) || (cpu >= nCPUS))
            continue;

          strncpy(cpusetMap[cpu].cQueueName,cpusetList->list[i],8);
          cpusetMap[cpu].cQueueName[8] = '\0';
          }

        /* release each list as soon as it has been consumed */

        cpusetFreeCPUList(cpuList);

        cpuList = NULL;
        }    /* END for (i) */
      }      /* END else */
    }        /* END else */

  if (fp != NULL)
    {
    fclose(fp);

    fp = NULL;
    }

  /* Determine the number of cpus to insert into the cpuset from the request */

  /* TODO: nodes */

  /* TODO: neednodes */

  /* ncpus */

  pattr = &pjob->ji_wattr[(int)JOB_ATR_resource];
  prd = find_resc_def(svr_resc_def,"ncpus",svr_resc_size);
  presc = find_resc_entry(pattr,prd);

  if ((presc != NULL) && (cpusetMap != NULL))
    {
    /* Allocating cpuset definition using the ncpus attribute */

    cpuQdef = cpusetAllocQueueDef(presc->rs_value.at_val.at_long);

    if (cpuQdef == NULL)
      {
      sprintf(log_buffer,"cannot allocate cpuset definition");

      log_err(errno,id,log_buffer);
      }
    }

  if (cpuQdef != NULL) 
    {
    /* Queue Name can only be 3 - 8 chars long */

    strncpy(cQueueName,pwdp->pw_name,3);
    cQueueName[3] = '\0';

    strncat(cQueueName,pjob->ji_qs.ji_jobid,5);
    cQueueName[8] = '\0';

    /* Set Memory Affinity */

    cpuQdef->flags = CPUSET_CPU_EXCLUSIVE | CPUSET_MEMORY_LOCAL;

    /* Setting the number of cpus in the cpuset to what was requested by ncpus */

    cpuQdef->cpu->count = presc->rs_value.at_val.at_long;

    strcpy(cPermFile,PBS_SERVER_HOME);
    strcat(cPermFile,"/mom_priv/jobs/");
    strcat(cPermFile,cQueueName);
    strcat(cPermFile,".CS");
    cpuQdef->permfile = cPermFile;

    /* write cpuset definition file */

    if ((fp = fopen(cpuQdef->permfile,"w")) == NULL) 
      {
      sprintf(log_buffer,"cannot create cpuset defintion file");

      log_err(errno,id,log_buffer);
      }
    else
      {
      /* Comment Header, see cpuset(4) */

      fprintf(fp,"#CPUSET CONFIGURATION FILE\n");
      }

    /* First Come, First Served when assigning Cpus to a new CpuSet */

    j = 0;

#ifdef CPUSETS_FIRST_CPU
    for (i = CPUSETS_FIRST_CPU;i < nCPUS;i++) 
#else
    for (i = 0;i < nCPUS;i++) 
#endif
      {
      if (j >= presc->rs_value.at_val.at_long)
        break;

      printf ("%d %s\n",
        i,
        cpusetMap[i].cQueueName);

      /* an empty queue name means the cpu is not in any cpuset yet */

      if (!strlen(cpusetMap[i].cQueueName)) 
        {
        cpuQdef->cpu->list[j++] = cpusetMap[i].CpuId;

        /* NOTE(review): no newline is written between entries - confirm
         * against the cpuset(4) file format */

        if (fp != NULL)
          {
          fprintf(fp,"CPU %d",
            cpusetMap[i].CpuId);
          }
        }
      }    /* END for (i) */

    if (fp != NULL)
      {
      fclose(fp);

      fp = NULL;
      }

    /* Set the permissions to the definition file */

    if (chmod(cpuQdef->permfile,0700) != 0)
      {
      sprintf(log_buffer,"cannot chmod perm file");

      log_err(errno,id,log_buffer);
      }

    /* Chown the definition file to the user */

    if (chown(
          cpuQdef->permfile,
          pjob->ji_qs.ji_un.ji_momt.ji_exuid, 
          pjob->ji_qs.ji_un.ji_momt.ji_exgid) != 0) 
      {
      sprintf(log_buffer,"cannot chown perm file");

      log_err(errno,id,log_buffer);
      }

    /* Create the cpuset */

    if (!cpusetCreate(cQueueName,cpuQdef)) 
      {
      sprintf(log_buffer,"cannot create cpuset definition");

      log_err(errno,id,log_buffer);
      }

    /* Attach this process & all children processes to the cpuset */

    if (!cpusetAttach(cQueueName)) 
      {
      sprintf(log_buffer,"cannot attach cpuset definition");

      log_err(errno,id,log_buffer);
      }
    }

  /* Clean up dynamic structures (only those actually obtained) */

  if (cpusetList != NULL)
    cpusetFreeNameList(cpusetList);

  if (cpuQdef != NULL)
    cpusetFreeQueueDef(cpuQdef);

  memset(cQueueName,0,sizeof(cQueueName));
  free(cpusetMap);

#else  /* PENABLE_DYNAMIC_CPUSETS */

  /* NO-OP */

#endif  /* PENABLE_DYNAMIC_CPUSETS */

#endif  /* (PENABLE_CPUSETS || PENABLE_DYNAMIC_CPUSETS) */


#ifdef ENABLE_CPA
  /* Cray CPA setup */

  if ((j = CPACreatePartition(pjob,&vtable)) != 0)
    {
    log_err(-1,id,"CPACreatePartition failed");

    starter_return(TJE->upfds,TJE->downfds,j,&sjr);	/* exits */
    }
#endif

  /* specific system related variables */

  j = set_mach_vars(pjob,&vtable);

  if (j != 0) 
    {
    log_err(-1,id,"failed to set mach vars");

    starter_return(TJE->upfds,TJE->downfds,j,&sjr);	/* exits */
    }
	
  umask(077);

  if (TJE->is_interactive == TRUE) 
    {
    struct sigaction act;

    /*************************************************************/
    /*	We have an "interactive" job, connect the standard	 */
    /*	streams to a socket connected to qsub.			 */
    /*************************************************************/

    sigemptyset(&act.sa_mask);
#ifdef SA_INTERRUPT
    act.sa_flags   = SA_INTERRUPT;
#else
    act.sa_flags   = 0;
#endif /* SA_INTERRUPT */
    act.sa_handler = no_hang;

    sigaction(SIGALRM,&act,(struct sigaction *)0);

    /* only giving ourselves 5 seconds to connect to qsub
     * and get term settings */
    alarm(5);

    /* once we connect to qsub and open a pty, the user can send us
     * a ctrl-c.  It is important that we block this until we exec()
     * the user's shell or we exit and the job gets stuck */

    act.sa_handler = SIG_IGN;

    sigaction(SIGINT,&act,(struct sigaction *)0);

    /* Set environment to reflect interactive */

    bld_env_variables(&vtable,"PBS_ENVIRONMENT","PBS_INTERACTIVE");

    /* get host where qsub resides */

    phost = arst_string("PBS_O_HOST",&pjob->ji_wattr[(int)JOB_ATR_variables]);
    pport = pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long;

    /* the entry has the form NAME=value; step to the value */

    if ((phost == NULL) || ((phost = strchr(phost,'=')) == NULL)) 
      {
      log_err(-1,id,"PBS_O_HOST not set");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);

      /*NOTREACHED*/

      exit(1);
      }

    phost++;

    qsub_sock = conn_qsub(phost,pport);

    if (qsub_sock < 0) 
      {
      log_err(errno,id,"cannot open qsub sock");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);

      /*NOTREACHED*/

      exit(1);
      }

    FDMOVE(qsub_sock);

    /* send job id as validation to qsub */

    if (write(qsub_sock,pjob->ji_qs.ji_jobid,PBS_MAXSVRJOBID + 1) != PBS_MAXSVRJOBID + 1) 
      {
      log_err(errno,id,"cannot write jobid");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);
      }

    /* receive terminal type and window size */

    if ((termtype = rcvttype(qsub_sock)) == NULL) 
      {
      log_err(errno,id,"cannot get termtype");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);

      /*NOTREACHED*/
  
      exit(1);
      }

    bld_env_variables(&vtable,termtype,NULL);

    *(vtable.v_envp + vtable.v_used) = NULL;	/* null term */

    if (rcvwinsize(qsub_sock) == -1)
      {
      log_err(errno,id,"cannot get winsize");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);

      /*NOTREACHED*/

      exit(1);
      }

    /* turn off alarm set around qsub connect activities */

    alarm(0);

    act.sa_handler = SIG_DFL;
    act.sa_flags   = 0;

    sigaction(SIGALRM,&act,NULL);

    /* set up the job session (update sjr) */

    j = set_job(pjob,&sjr);

    memcpy(TJE->sjr,&sjr,sizeof(sjr));

    if (j < 0) 
      {
      if (j == -1) 
        {
        /* set_job didn't leave message in log_buffer */

        strcpy(log_buffer,"unable to set session");
        }

      log_err(-1,id,log_buffer);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr); 
      }

    /* open the slave pty as the controlling tty */
			
    if ((pts = open_pty(pjob)) < 0) 
      {
      log_err(errno,id,"cannot open slave");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);
      }

    act.sa_handler = SIG_IGN;	/* setup to ignore SIGTERM */ 

    writerpid = fork();

    if (writerpid == 0) 
      {
      /* child is "writer" process */

      sigaction(SIGTERM,&act,NULL);

      close(TJE->upfds);
      close(TJE->downfds);
      close(pts);

      mom_writer(qsub_sock,TJE->ptc);

      shutdown(qsub_sock,2);

      exit(0);
      } 

    if (writerpid > 0) 
      {
      /*
      ** parent -- it first runs the prolog then forks
      ** again.  the child becomes the job while the
      ** parent becomes the reader.
      */

      close(1);
      close(2);
      dup2(pts,1);
      dup2(pts,2);

      fflush(stdout);
      fflush(stderr);

      set_termcc(pts);	/* set terminal control char */

      setwinsize(pts);	/* set window size to qsub's */

      /* run prolog - interactive job */

      if (run_pelog(
           PE_PROLOG,
           path_prolog, 
           pjob,
           PE_IO_TYPE_ASIS) != 0) 
        {

        log_err(-1,id,"interactive prolog failed");

        starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);
        }

      /* run user prolog */

      if (run_pelog(
           PE_PROLOGUSER,
           path_prologuser,
           pjob,
           PE_IO_TYPE_ASIS) != 0)
        {

        log_err(-1,id,"interactive user prolog failed");

        starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

        /*NOTREACHED*/
        }

      shellpid = fork();

      if (shellpid == 0) 
        {
        /*********************************************/
        /* child - this will be the interactive job  */
        /* i/o is to slave tty			     */
        /*********************************************/

        close(0);

        dup2(pts,0);

        fflush(stdin);

        close(TJE->ptc);  /* close master side */
        close(pts);       /* dup'ed above */
        close(qsub_sock);

        /* continue setting up and exec-ing shell */
        } 
      else 
        {
        if (shellpid > 0) 
          {
          /* fork, parent is "reader" process  */

          sigaction(SIGTERM,&act,NULL);

          close(pts);
          close(TJE->upfds);
          close(TJE->downfds);
          close(1);
          close(2);

          sigemptyset(&act.sa_mask);

          act.sa_flags   = SA_NOCLDSTOP;
          act.sa_handler = catchinter;

          sigaction(SIGCHLD,&act,NULL);

          mom_reader_go = 1;
          mom_reader(qsub_sock,TJE->ptc);
          }
        else 
          {
          log_err(errno,id,"can't fork reader");
          }

        /* make sure qsub gets EOF */

        shutdown(qsub_sock,2);

        /* change pty back to available after job is done */

        chmod(TJE->ptc_name,0666);
        chown(TJE->ptc_name,0,0);

        exit(0);
        }
      } 
    else 
      { /* error */

      log_err(errno,id,"cannot fork nanny");

      /* change pty back to available */

      chmod(TJE->ptc_name,0666);
      chown(TJE->ptc_name,0,0);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_RETRY,&sjr);
      }
    }    /* END if (TJE->is_interactive == TRUE) */ 
  else 
    {
    /*************************************************************/
    /*	We have a "normal" batch job, connect the standard	 */
    /*	streams to files					 */
    /*************************************************************/

    /* set Environment to reflect batch */

    bld_env_variables(&vtable,"PBS_ENVIRONMENT","PBS_BATCH");
    bld_env_variables(&vtable,"ENVIRONMENT","BATCH");

#if SHELL_USE_ARGV == 1
    /* connect stdin to /dev/null and feed the name of
     * the script on the command line */

    if (TJE->is_interactive == FALSE)
      script_in = open("/dev/null",O_RDONLY,0);
#elif SHELL_INVOKE == 1
    /* if passing script file name as input to shell */

    close(TJE->pipe_script[1]);
  
    script_in = TJE->pipe_script[0];
#else	/* SHELL_USE_ARGV || SHELL_INVOKE */
    /* if passing script itself as input to shell */

    strcpy(buf,path_jobs);
    strcat(buf,pjob->ji_qs.ji_fileprefix);
    strcat(buf,JOB_SCRIPT_SUFFIX);

    if ((script_in = open(buf,O_RDONLY,0)) < 0) 
      {
      /* a missing script is tolerated: read from /dev/null instead */

      if (errno == ENOENT)
        script_in = open("/dev/null",O_RDONLY,0);
      }
#endif  /* SHELL_USE_ARGV */

    if (script_in < 0) 
      {
      log_err(errno,id,"Unable to open script");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);
      }

    FDMOVE(script_in);	/* make sure descriptor > 2 */

    /* make script_in the new stdin */

    if (script_in != 0) 
      {
      close(0);
      dup(script_in);
      close(script_in);
      }

    if (open_std_out_err(pjob) == -1) 
      {
      log_err(-1,id,"unable to open stdout/stderr descriptors");

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL1,&sjr);
      }

    /* run prolog - standard batch job */
		
    if ((j = run_pelog(
        PE_PROLOG,
        path_prolog, 
        pjob, 
        PE_IO_TYPE_ASIS)) != 0) 
      {
      log_err(-1,id,"batch job prolog failed");

      if (j == 1)
        {
        /* permanent failure - abort job */

        starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);
        }
      else
        {
        /* retry - requeue job */

        starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_RETRY,&sjr);
        }

      /*NOTREACHED*/
      } 

    /* run user prolog */

    if ((j = run_pelog(
        PE_PROLOGUSER,
        path_prologuser,
        pjob,
        PE_IO_TYPE_ASIS)) != 0)
      {
      log_err(-1,id,"batch job user prolog failed");

      if (j == 1)
        {
        /* permanent failure - abort job */

        starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);
        }
      else
        {
        /* retry - requeue job */

        starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_RETRY,&sjr);
        }

      /*NOTREACHED*/
      }

    /* set up the job session (update sjr) */

    j = set_job(pjob,&sjr);

    memcpy(TJE->sjr,&sjr,sizeof(sjr));

    if (j < 0) 
      {
      /* FAILURE */

      /* NOTE(review): the interactive path tests (j == -1) here while this
       * path tests (j != -2) - confirm against set_job()'s return codes */

      if (j != -2) 
        {
        /* set_job didn't leave message in log_buffer */

        strcpy(log_buffer,"Unable to set session");
        }

      /* otherwise set_job left its message in log_buffer */

      log_err(-1,id,log_buffer);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

      /*NOTREACHED*/
      }
    }    /* END else (TJE->is_interactive == TRUE) */

  /***********************************************************************/
  /*	Set resource limits				 		 */
  /*	Both normal batch and interactive job come through here 	 */
  /*                                                                     */
  /*    output fds to the user are setup at this point, so write() all   */
  /*    errors (with a \n) directly to the user on fd 2 and fsync(2) it  */
  /***********************************************************************/

  pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long = sjr.sj_session;

  pjob->ji_wattr[(int)JOB_ATR_session_id].at_flags =
    ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_SEND;

  /* leaving a note for myself to check this later...
     why is it necessary to set JOB_ATR_session_id above?  We are a child process
     and setting that attr should be useless.  But if it isn't set, MOM sometimes
     SIGKILLs herself with interactive jobs -garrick */

  if (site_job_setup(pjob) != 0) 
    {
    /* FAILURE */

    sprintf(log_buffer,"PBS: site specific job setup failed\n");

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr); /* exits */

    /*NOTREACHED*/
    }

  /* cleared so we can tell whether mom_set_limits() left a message */

  log_buffer[0] = '\0';

  if ((i = mom_set_limits(pjob,SET_LIMIT_SET)) != PBSE_NONE) 
    {
    if (log_buffer[0] != '\0')
      {
      /* report error to user via stderr file */

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);
      }

    if (i == PBSE_RESCUNAV)	
      {	
      /* resource temp unavailable */

      if (TJE->is_interactive == TRUE)
        j = JOB_EXEC_FAIL2;
      else
        j = JOB_EXEC_RETRY;
      }
    else
      {
      j = JOB_EXEC_FAIL2;
      }

    starter_return(TJE->upfds,TJE->downfds,j,&sjr); /* exits */

    /*NOTREACHED*/

    return(-1);
    }


  endpwent();

  if ((idir = get_job_envvar(pjob,"PBS_O_ROOTDIR")) != NULL)
    {
    if (chroot(idir) == -1)
      {
      sprintf(log_buffer,"PBS: chroot to '%.256s' failed: %s\n",
        idir,
        strerror(errno));

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

      /*NOTREACHED*/

      return(-1);
      }
    }

  /*
   * become the user, execv the shell and become the real job 
   *
   * Every step of the privilege drop is checked: if any of them failed and
   * we carried on, the user's shell would be exec'ed with the credentials
   * inherited from MOM.
   */

  if (setgroups(
        pjob->ji_grpcache->gc_ngroup,
        (gid_t *)pjob->ji_grpcache->gc_groups) != 0)
    {
    sprintf(log_buffer,"PBS: setgroups for UID = %lu failed: %s\n",
      (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
      strerror(errno));

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

    /*NOTREACHED*/

    return(-1);
    }

  if (setgid(pjob->ji_qs.ji_un.ji_momt.ji_exgid) != 0)
    {
    sprintf(log_buffer,"PBS: setgid to %lu for UID = %lu failed: %s\n",
      (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exgid,
      (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
      strerror(errno));

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

    /*NOTREACHED*/

    return(-1);
    }

  if (setuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) != 0)
    {
    sprintf(log_buffer,"PBS: setuid to %lu failed: %s\n",
      (unsigned long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
      strerror(errno));

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

    /*NOTREACHED*/

    return(-1);
    }

#ifdef _CRAY
  seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid); /* cray kludge */
#endif	/* CRAY */

  /*
   * cwd to PBS_O_INITDIR if specified, otherwise User's Home
   */

  if ((idir = get_job_envvar(pjob,"PBS_O_INITDIR")) != NULL)
    {
    /* in TMomFinalizeChild() executed as user */

    if (chdir(idir) == -1)
      {
      sprintf(log_buffer,"PBS: chdir to '%.256s' failed: %s\n",
        idir,
        strerror(errno));

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

      /*NOTREACHED*/

      return(-1);
      }
    }
  else 
    {
    /* in TMomFinalizeChild() executed as user */

    if (chdir(pwdp->pw_dir) == -1)
      {
      sprintf(log_buffer,"PBS: chdir to '%.256s' failed: %s\n",
        pwdp->pw_dir,
        strerror(errno));

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_FAIL2,&sjr);

      /*NOTREACHED*/

      return(-1);
      }
    }
	
  /* X11 forwarding init */

  if ((TJE->is_interactive == TRUE) && pjob->ji_wattr[(int)JOB_ATR_forwardx11].at_val.at_str)
    {
    char display[512];

    if(x11_create_display(1, /* use localhost only */
                          display, /* output */
                          phost, pport,
                          pjob->ji_grpcache->gc_homedir,
                          pjob->ji_wattr[(int)JOB_ATR_forwardx11].at_val.at_str) >= 0)
      {
      bld_env_variables(&vtable,"DISPLAY",display);
      }
    else
      {
      /* not fatal: the job still starts, just without X11 forwarding */

      sprintf(log_buffer,"PBS: X11 forwarding init failed\n");

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);
      }
    }

  /* NULL terminate the envp array, This is MUST DO */

  *(vtable.v_envp + vtable.v_used) = NULL;

  /* tell mom we are going */

  starter_return(TJE->upfds,TJE->downfds,JOB_EXEC_OK,&sjr);

  log_close(0);  /* FIXME:  this is useless, right? */

  /* single-node jobs exec the shell directly; otherwise fork once more so
   * the parent execs the shell and the child execs the demux program */

  if ((pjob->ji_numnodes == 1) ||
     ((cpid = fork()) > 0)) 
    {	
    /* parent does the shell */

    /* close sockets that child uses */

    if (pjob->ji_stdout >= 0)
      close(pjob->ji_stdout);

    if (pjob->ji_stderr >= 0)
      close(pjob->ji_stderr);

    /* construct argv array */

    shellname = strrchr(shell,'/');

    if (shellname != NULL)
      ++shellname;	/* go past last '/' */
    else
      shellname = shell;

    /* argv[0] is "-<shellname>": one byte for '-', one for the NUL */

    arg[0] = malloc(strlen(shellname) + 2);

    if (arg[0] == NULL)
      {
      sprintf(log_buffer,"PBS: cannot allocate memory for shell arguments\n");

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      exit(254);
      }

    strcpy(arg[0],"-");

    strcat(arg[0],shellname);

    arg[1] = NULL;

#if SHELL_USE_ARGV == 1
    if (TJE->is_interactive == FALSE) 
      {
      arg[1] = malloc(
        strlen(path_jobs) +
        strlen(pjob->ji_qs.ji_fileprefix) +
        strlen(JOB_SCRIPT_SUFFIX) + 1);

      if (arg[1] == NULL)
        {
        sprintf(log_buffer,"PBS: cannot allocate memory for shell arguments\n");

        write(2,log_buffer,strlen(log_buffer));

        fsync(2);

        exit(254);
        }

      strcpy(arg[1],path_jobs);
      strcat(arg[1],pjob->ji_qs.ji_fileprefix);
      strcat(arg[1],JOB_SCRIPT_SUFFIX);

      arg[2] = NULL;
      } 
#endif /* SHELL_USE_ARGV */

    if (TJE->is_interactive == TRUE) 
      {
      struct sigaction act;

      /* restore SIGINT so that the child shell can use ctrl-c */

      sigemptyset(&act.sa_mask);
      act.sa_flags   = 0;
      act.sa_handler = SIG_DFL;

      sigaction(SIGINT,&act,(struct sigaction *)0);
      }

    execve(shell,arg,vtable.v_envp);

    /* reached only if execve fails */
    }
  else if (cpid == 0)
    {	
    /* child does demux */
 
    char *demux = DEMUX;

    /* setup descriptors 3 and 4 */

    dup2(pjob->ji_stdout,3);

    if (pjob->ji_stdout > 3)
      close(pjob->ji_stdout);

    dup2(pjob->ji_stderr,4);

    if (pjob->ji_stderr > 4)
      close(pjob->ji_stderr);

    /* construct argv array */

    shellname = strrchr(demux,'/');

    if (shellname != NULL)
      ++shellname;	/* go past last '/' */
    else
      shellname = demux;	/* no directory part: use the demux name itself */

    arg[0] = malloc(strlen(shellname) + 1);

    if (arg[0] == NULL)
      {
      sprintf(log_buffer,"PBS: cannot allocate memory for shell arguments\n");

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      exit(254);
      }

    strcpy(arg[0],shellname);

    arg[1] = NULL;

    execve(demux,arg,vtable.v_envp);

    /* reached only if execve fails */

    shell = demux;  /* for the message below */
    }  /* END else if (cpid == 0) */

  /* exec failed (or the demux fork failed): tell the user, then exit */

  sprintf(log_buffer,"PBS: exec of shell '%.256s' failed\n",
    shell);

  write(2,log_buffer,strlen(log_buffer));

  fsync(2);

  /* debug-only hints about the likely cause */

  if (strlen(shell) == 0)
    {
    extern char mom_host[];

    DBPRT(("user \"%s\" may not have a shell defined on node \"%s\"\n",
      pwdp->pw_name,
      mom_host));
    }
  else if (strstr(shell,"/bin/false") != NULL)
    {
    extern char mom_host[];

    DBPRT(("user \"%s\" has shell \"/bin/false\" on node \"%s\"\n",
      pwdp->pw_name,
      mom_host));
    }
  else
    {
    struct stat sbuf;

    if (stat(shell, &sbuf) != 0)
      {
      DBPRT(("stat of shell \"%s\" failed with error %d\n",
        shell, 
        errno));
      }
    else if (S_ISREG(sbuf.st_mode) == 0)
      {
      DBPRT(("shell \"%s\" is not a file\n",
        shell));
      }
    else if ((sbuf.st_mode & S_IXUSR) == 0)
      {
      /* owner execute bit is clear */

      DBPRT(("shell \"%s\" is not executable by user \"%s\"\n",
        shell,
        pwdp->pw_name));
      }
    }

  exit(254);	/* should never, ever get here */

  /*NOTREACHED*/

  return(-1);
  }  /* END TMomFinalizeChild() */





/*
 * TMomFinalizeJob3() - last step of launching a job from MOM.
 *
 * The child has already reported in via the pipe; ReadSize and ReadErrno
 * describe the outcome of that read and TJE->sjr holds the record itself
 * (populated in TMomFinalizeJob2()).  The record is echoed back to the
 * child as an acknowledgement, the task is marked running and saved, and
 * the job goes from JOB_SUBSTATE_PRERUN to JOB_SUBSTATE_RUNNING.
 *
 * Returns SUCCESS or FAILURE.  On FAILURE *SC is set to the JOB_EXEC_*
 * code to hand to the caller; on SUCCESS *SC is left untouched.
 */

int TMomFinalizeJob3(

  pjobexec_t *TJE,        /* I (modified) */
  int         ReadSize,   /* I (bytes read from child pipe) */
  int         ReadErrno,  /* I (errno value from read) */
  int        *SC)         /* O (return code) */

  {
  char *id = "TMomFinalizeJob3";

  struct startjob_rtn report;

  job  *pjob  = (job *)TJE->pjob;
  task *ptask = (task *)TJE->ptask;

  /* work on a private copy of the record kept in TJE->sjr */

  memcpy(&report,TJE->sjr,sizeof(report));

  /* MOM is done reading from the child */

  close(TJE->jsmpipe[0]);

  if (ReadSize != sizeof(report))
    {
    /* FAILURE - short or failed read, no usable session info */

    sprintf(log_buffer,"read of pipe for sid failed for job %s (%d of %d bytes)",
      pjob->ji_qs.ji_jobid,
      ReadSize,
      (int)sizeof(report));

    log_err(ReadErrno,id,log_buffer);

    strcpy(log_buffer,"start failed, improper sid");

    log_record(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,id,log_buffer);

    close(TJE->mjspipe[1]);

    *SC = JOB_EXEC_RETRY;

    return(FAILURE);
    }

  /* echo the record back so the child knows MOM received it */

  write(TJE->mjspipe[1],&report,sizeof(report));

  close(TJE->mjspipe[1]);

  if (LOGLEVEL >= 3)
    {
    sprintf(log_buffer,"read start return code=%d session=%ld",
      report.sj_code,
      (long)report.sj_session);

    log_record(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,id,log_buffer);
    }

  if (report.sj_code < 0)
    {
    /* FAILURE - child reported a negative start code */

    char        reason[1024];
    const char *verdict;

    reason[0] = '\0';

    if (report.sj_code == JOB_EXEC_OK)
      {
      strcpy(reason,"no failure");
      }
    else if (report.sj_code == JOB_EXEC_FAIL1)
      {
      strcpy(reason,"job exec failure, before files staged, no retry");
      }
    else if (report.sj_code == JOB_EXEC_FAIL2)
      {
      strcpy(reason,"job exec failure, after files staged, no retry");
      }
    else if (report.sj_code == JOB_EXEC_RETRY)
      {
      strcpy(reason,"job exec failure, retry will be attempted");
      }
    else
      {
      sprintf(reason,"job exec failure, code=%d",
        report.sj_code);
      }

    if (report.sj_code == JOB_EXEC_RETRY)
      verdict = "Retry";
    else
      verdict = "Failure";

    sprintf(log_buffer,"job not started, %s %s",
      verdict,
      reason);

    log_record(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,id,log_buffer);

    *SC = report.sj_code;

    return(FAILURE);
    }  /* END if (report.sj_code < 0) */

  /* pjob modified */

  set_globid(pjob,&report);

  /* the task now has a session and is considered running */

  ptask->ti_qs.ti_sid    = report.sj_session;
  ptask->ti_qs.ti_status = TI_STATE_RUNNING;

  strcpy(ptask->ti_qs.ti_parentjobid,pjob->ji_qs.ji_jobid);

  if (LOGLEVEL >= 6)
    {
    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "saving task (TMomFinalizeJob3)");
    }

  if (task_save(ptask) == -1)
    {
    /* FAILURE - task could not be saved */

    strcpy(log_buffer,"Task save failed");

    log_record(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,id,log_buffer);

    *SC = JOB_EXEC_RETRY;

    return(FAILURE);
    }

  if (pjob->ji_numnodes > 1)
    {
    /*
    ** Multi-node job: close the two sockets and keep only their port
    ** numbers in the job struct.  The job uses them to talk to demux,
    ** main MOM does not need the sockets; start_process() uses the
    ** stored ports to connect to pbs_demux.
    */

    close(pjob->ji_stdout);
    close(pjob->ji_stderr);

    pjob->ji_stdout = TJE->port_out;
    pjob->ji_stderr = TJE->port_err;
    }

  /* the starter says the job is a go: record session id and start time */

  pjob->ji_wattr[(int)JOB_ATR_session_id].at_val.at_long = report.sj_session;

  pjob->ji_wattr[(int)JOB_ATR_session_id].at_flags =
    ATR_VFLAG_SET | ATR_VFLAG_MODIFY | ATR_VFLAG_SEND;

  pjob->ji_qs.ji_state    = JOB_STATE_RUNNING;
  pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;
  pjob->ji_qs.ji_stime    = time_now;

  /* changed from SAVEJOB_QUICK to SAVEJOB_FULL (USC - 2/5/2005) */

  job_save(pjob,SAVEJOB_FULL);

  sprintf(log_buffer,"job %s started, pid = %ld",
    pjob->ji_qs.ji_jobid,
    (long)report.sj_session);

  log_record(PBSEVENT_ERROR,PBS_EVENTCLASS_JOB,id,log_buffer);

  return(SUCCESS);
  }  /* END TMomFinalizeJob3() */





/*
** Start a process for a spawn request.  This will be different from
** a job's initial shell task in that the environment will be specified
** and no interactive code need be included.
**
** Two pipes connect MOM (parent) and the new task (child): the child
** writes a startjob_rtn record on one and waits for MOM to echo it back
** on the other (see starter_return()).
**
** Returns 0 in the parent once the task has been started and saved, -1
** in the parent on any failure.  The child never returns: it either
** becomes the command via execvp() or exits.
**
** Every pipe descriptor opened here is closed again on each parent-side
** error path, and the child refuses to exec the command if it cannot
** switch to the job owner's groups/gid/uid.
*/

int start_process(

  task	 *ptask,  /* I (modified) */
  char	**argv,   /* I (argv[0] is the command given to execvp()) */
  char	**envp)   /* I (handed to InitUserEnv()) */

  {
  static char id[] = "start_process";

  char  *idir;
  job	*pjob = ptask->ti_job;
  pid_t	pid;
  int	pipes[2];
  int	kid_read     = -1;
  int	kid_write    = -1;
  int	parent_read  = -1;
  int	parent_write = -1;
  int	pts;
  int	i, j;
  int	fd0, fd1, fd2;
  u_long ipaddr;
  struct  startjob_rtn sjr = {0,0};

  /* first pipe: child -> parent */

  if (pipe(pipes) == -1)
    {
    return(-1);
    }

  parent_read = pipes[0];

  if (pipes[1] < 3) 
    {
    /* keep the child's end out of the stdin/stdout/stderr slots */

    kid_write = fcntl(pipes[1],F_DUPFD,3);

    close(pipes[1]);

    if (kid_write == -1)
      goto prefork_bail;
    }
  else
    {
    kid_write = pipes[1];
    }

  /* second pipe: parent -> child (acknowledgement) */

  if (pipe(pipes) == -1)
    {
    goto prefork_bail;
    }

  parent_write = pipes[1];

  if (pipes[0] < 3) 
    {
    kid_read = fcntl(pipes[0],F_DUPFD,3);

    close(pipes[0]);

    if (kid_read == -1)
      goto prefork_bail;
    }
  else
    {
    kid_read = pipes[0];
    }

  /*
  ** Get ipaddr to Mother Superior.
  */

  if (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE)	/* I'm MS */
    {
    ipaddr = htonl(localaddr);
    }
  else 
    {
    struct sockaddr_in	*ap;

    /*
    ** We always have a stream open to MS at node 0.
    */

    i = pjob->ji_hosts[0].hn_stream;

    if ((ap = rpp_getaddr(i)) == NULL) 
      {
      sprintf(log_buffer,"job %s has no stream to MS",
        pjob->ji_qs.ji_jobid);

      log_err(-1,id,log_buffer);

      goto prefork_bail;
      }

    ipaddr = ap->sin_addr.s_addr;
    }  /* END else (pjob->ji_qs.ji_svrflags & JOB_SVFLG_HERE) */

  /* A restarted mom will not have called this yet, but it is needed
   * to spawn tasks (ji_grpcache).
   */

  if (!check_pwd(pjob)) 
    {
    log_err(-1,id,log_buffer);

    goto prefork_bail;
    }

  /*
  ** Begin a new process for the fledgling task.
  */

  if ((pid = fork_me(-1)) == -1)
    {
    /* fork failed */

    goto prefork_bail;
    }

  if (pid != 0) 
    {		
    /* parent */

    int gotsuccess = 0;

    close(kid_read);
    close(kid_write);

    /*
     * Read the child's report.  After a first successful (code 0) record
     * is acknowledged, a second read is attempted; whatever that second
     * read returns, the size check below is satisfied (i is forced to
     * sizeof(sjr)) and sjr holds whatever the second read stored, if
     * anything.
     */

    for (;;) 
      {
      i = read(parent_read,(char *)&sjr,sizeof(sjr));

      if ((i == -1) && (errno == EINTR))
        continue;

      if ((i == sizeof(sjr)) && (sjr.sj_code == 0) && !gotsuccess)
        {
        gotsuccess = 1;

        write(parent_write,&sjr,sizeof(sjr));

        continue;
        }

      if (gotsuccess)
        {
        i = sizeof(sjr);
        }

      break;
      }  /* END for(;;) */

    j = errno;

    close(parent_read);

    if (i != sizeof(sjr)) 
      {
      sprintf(log_buffer,"read of pipe for sid job %s got %d not %ld (errno: %d, %s)",
        pjob->ji_qs.ji_jobid, 
        i, 
        (long)sizeof(sjr),
        j,
        strerror(j));

      log_err(j,id,log_buffer);
 
      close(parent_write);

      return(-1);
      }

    /* acknowledge the (final) record */

    write(parent_write,&sjr,sizeof(sjr));

    close(parent_write);

    DBPRT(("%s: read start return %d %ld\n", 
      id,
      sjr.sj_code, 
      (long)sjr.sj_session))

    if (sjr.sj_code < 0) 
      {
      char tmpLine[1024];

      tmpLine[0] = '\0';

      switch(sjr.sj_code)
        {
        case JOB_EXEC_OK:  /* 0 */

          /* NO-OP */

          break;

        case JOB_EXEC_FAIL1:  /* -1 */

          strcpy(tmpLine,"stdio setup failed");

          break;

        case JOB_EXEC_FAIL2:  /* -2 */

          strcpy(tmpLine,"env setup or user dir problem");

          break;

        case JOB_EXEC_RETRY: /* -3 */

          strcpy(tmpLine,"unable to set limits, retry will be attempted");

          break;

        case JOB_EXEC_CMDFAIL: /* -8 */

          strcpy(tmpLine,"command exec failed");

          break;

        default:

          sprintf(tmpLine,"code=%d",
            sjr.sj_code);

          break;
        }  /* END switch (sjr.sj_code) */
 
      sprintf(log_buffer,"task not started, '%s', %s (see syslog)",
        argv[0],
        tmpLine);

      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      return(-1);
      }

    set_globid(pjob,&sjr);

    ptask->ti_qs.ti_sid = sjr.sj_session;
    ptask->ti_qs.ti_status = TI_STATE_RUNNING;

    if (LOGLEVEL >= 6)
      {
      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        "task set to running/saving task (start_process)");
      }

    task_save(ptask);

    if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) 
      {
      pjob->ji_qs.ji_state    = JOB_STATE_RUNNING;
      pjob->ji_qs.ji_substate = JOB_SUBSTATE_RUNNING;

      job_save(pjob,SAVEJOB_QUICK);
      }

    sprintf(log_buffer,"%s: task started, tid %d, sid %ld, cmd %s",
      id, 
      ptask->ti_qs.ti_task, 
      (long)ptask->ti_qs.ti_sid, 
      argv[0]);

    log_record(
      PBSEVENT_JOB, 
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid, 
      log_buffer);

    return(0);
    }  /* END if (pid != 0) */

  /************************************************/
  /* The child process - will become the TASK	  */
  /************************************************/

  if (lockfds >= 0)
    {
    close(lockfds);

    lockfds = -1;
    }

  close(parent_read);
  close(parent_write);

  /*
   * set up the Environmental Variables to be given to the job 
   */

  if (InitUserEnv(pjob,ptask,envp,NULL,NULL) < 0)
    {
    log_err(errno,id,"failed to setup user env");

    starter_return(kid_write,kid_read,JOB_EXEC_RETRY,&sjr);
    }

  if (set_mach_vars(pjob,&vtable) != 0) 
    {
    strcpy(log_buffer,"PBS: machine dependent environment variable setup failed\n");

    log_err(errno,id,log_buffer);

    starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

    /*NOTREACHED*/

    exit(1);
    }

  umask(077);

  /* set Environment to reflect batch */

  bld_env_variables(&vtable,"PBS_ENVIRONMENT","PBS_BATCH");
  bld_env_variables(&vtable,"ENVIRONMENT",    "BATCH");

  /* NULL terminate the envp array, This is MUST DO */

  *(vtable.v_envp + vtable.v_used) = NULL;

  /*
  ** Set up stdin.
  */

  /* look through env for a port# on MS we should use for stdin */

  if ((fd0 = search_env_and_open("MPIEXEC_STDIN_PORT",ipaddr)) == -2)
    starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

  if (fd0 < 0)
    if ((fd0 = search_env_and_open("TM_STDIN_PORT",ipaddr)) == -2)
      starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

  /* use /dev/null if no env var found */

  if ((fd0 < 0) && (fd0 = open("/dev/null",O_RDONLY)) == -1) 
    {
    log_err(errno,id,"could not open dev/null");

    close(0);
    }
  else 
    {
    dup2(fd0,0);

    if (fd0 > 0)
      close(fd0);
    }

  /* look through env for a port# on MS we should use for stdout/err */

  if ((fd1 = search_env_and_open("MPIEXEC_STDOUT_PORT",ipaddr)) == -2)
    starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

  if (fd1 < 0)
    if ((fd1 = search_env_and_open("TM_STDOUT_PORT",ipaddr)) == -2)
      starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

  if ((fd2 = search_env_and_open("MPIEXEC_STDERR_PORT",ipaddr)) == -2)
    starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

  if (fd2 < 0)
    if ((fd2 = search_env_and_open("TM_STDERR_PORT",ipaddr)) == -2)
      starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

  if (pjob->ji_numnodes > 1) 
    {
    /*
    ** Open sockets to demux proc for stdout and stderr.
    */

    if ((fd1 < 0) && ((fd1 = open_demux(ipaddr,pjob->ji_stdout)) == -1))
      {
      starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);
  
      /*NOTREACHED*/

      exit(1);
      }

    dup2(fd1,1);

    if (fd1 > 1)
      close(fd1);

    if ((fd2 < 0) && ((fd2 = open_demux(ipaddr,pjob->ji_stderr)) == -1))
      {
      starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);

      /*NOTREACHED*/

      exit(1);
      }

    dup2(fd2,2);

    if (fd2 > 2)
      close(fd2);

    /* never send cookie - PW mpiexec patch */

    /*	
    write(1,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
      strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str));

    write(2,pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
      strlen(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str));
    */
    } 
  else if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags&ATR_VFLAG_SET) &&
           (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) 
    {
    /* interactive job, single node, write to pty */

    pts = -1;

    if ((fd1 < 0) || (fd2 < 0)) 
      {
      if ((pts = open_pty(pjob)) < 0) 
        {
        log_err(errno,id,"cannot open slave pty");

        starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);
        }

      if (fd1 < 0)
        fd1 = pts;

      if (fd2 < 0)
        fd2 = pts;
      }

    dup2(fd1,1);
    dup2(fd2,2);

    if (fd1 != pts)
      close(fd1);

    if (fd2 != pts)
       close(fd2);
    }
  else 
    {
    /* normal batch job, single node, write straight to files */

    pts = -1;

    if ((fd1 < 0) || (fd2 < 0))
      {
      if (open_std_out_err(pjob) == -1)
        {
        log_err(errno,id,"cannot open job stderr/stdout files");

        starter_return(kid_write,kid_read,JOB_EXEC_FAIL1,&sjr);
        }
      }

    if (fd1 >= 0)
      {
      close(1);
      dup2(fd1,1);

      if (fd1 > 1)
        close(fd1);
      }

    if (fd2 >= 0)
      {
      close(2);
      dup2(fd2,2);

      if (fd2 > 2)
        close(fd2);
      }
    }    /* END else */

  /*******************************************************
   * At this point, output fds are setup for the job,
   * any further error messages should be written
   * directly to fd 2, with a \n, and ended with fsync(2)
   *******************************************************/

  j = set_job(pjob,&sjr);

  if (j < 0) 
    {
    if (j != -2) 
      {
      /* set_job didn't leave message in log_buffer */

      strcpy(log_buffer,"PBS: Unable to set task session\n");
      }

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
    }

  ptask->ti_qs.ti_sid = sjr.sj_session;

  log_buffer[0] = '\0';

  if ((i = mom_set_limits(pjob,SET_LIMIT_SET)) != PBSE_NONE) 
    {
    if (log_buffer[0] != '\0')
      {
      /* report error to user via stderr file */

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);
      }

    sprintf(log_buffer,"PBS: Unable to set limits, err=%d\n",
      i);

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    if (i == PBSE_RESCUNAV)		/* resource temp unavailable */
      j = JOB_EXEC_RETRY;
    else
      j = JOB_EXEC_FAIL2;

    starter_return(kid_write,kid_read,j,&sjr);
    }

  if ((idir = get_job_envvar(pjob,"PBS_O_ROOTDIR")) != NULL)
    {
    if (chroot(idir) == -1)
      {
      sprintf(log_buffer,"PBS: chroot to %.256s failed: %s\n",
        idir,
        strerror(errno));

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      starter_return(kid_write, kid_read, JOB_EXEC_FAIL2, &sjr);
      }
    }

  /*
   * Become the user.  Each step is checked: if any of them fails the
   * command must not be exec'ed with the credentials this process still
   * holds, so the task is aborted instead.
   */

  if (setgroups(pjob->ji_grpcache->gc_ngroup,
        (gid_t *)pjob->ji_grpcache->gc_groups) != 0)
    {
    sprintf(log_buffer,"PBS: setgroups for uid %ld failed: %s\n",
      (long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
      strerror(errno));

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
    }

  if (setgid(pjob->ji_qs.ji_un.ji_momt.ji_exgid) != 0)
    {
    sprintf(log_buffer,"PBS: setgid to %ld failed: %s\n",
      (long)pjob->ji_qs.ji_un.ji_momt.ji_exgid,
      strerror(errno));

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
    }

  if (setuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) != 0)
    {
    sprintf(log_buffer,"PBS: setuid to %ld failed: %s\n",
      (long)pjob->ji_qs.ji_un.ji_momt.ji_exuid,
      strerror(errno));

    write(2,log_buffer,strlen(log_buffer));

    fsync(2);

    starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
    }

#ifdef _CRAY
  seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid); /* cray kludge */
#endif /* CRAY */

  /* cwd to PBS_O_INITDIR if specified, otherwise User's Home */

  if ((idir = get_job_envvar(pjob,"PBS_O_INITDIR")) != NULL)
    {
    /* in start_process() executed as user */

    if (chdir(idir) == -1)
      {
      sprintf(log_buffer,"PBS: chdir to %.256s failed: %s\n",
        idir,
        strerror(errno));

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
      }
    }
  else 
    {
    /* in start_process() executed as user */

    if (chdir(pjob->ji_grpcache->gc_homedir) == -1)
      {
      sprintf(log_buffer,"PBS: chdir to %.256s failed: %s\n",
        pjob->ji_grpcache->gc_homedir,
        strerror(errno));

      write(2,log_buffer,strlen(log_buffer));

      fsync(2);

      starter_return(kid_write,kid_read,JOB_EXEC_FAIL2,&sjr);
      }
    }

  /* tell MOM everything is set up; starter_return() closes kid_read */

  starter_return(
    kid_write, 
    kid_read, 
    JOB_EXEC_OK, 
    &sjr);

  /* kid_write is closed automatically if the exec below succeeds */

  fcntl(kid_write, F_SETFD, FD_CLOEXEC);
#if 0	/* def DEBUG */
  for (i=3; i< 40; ++i) 
    {	/* check for any extra open descriptors */
    if (close(i) >= 0)
      fprintf(stderr, "Closed file %d\n", i);
    }
#endif	/* DEBUG */

  environ = vtable.v_envp;

  execvp(argv[0],argv);

  /* only reached if execvp() fails */

  sprintf(log_buffer,"PBS: %.256s: %s\n", 
    argv[0],
    strerror(errno));

  write(2,log_buffer,strlen(log_buffer));

  fsync(2);

  starter_return(kid_write,kid_read,JOB_EXEC_CMDFAIL,&sjr);

  exit(254);

  /*NOTREACHED*/

prefork_bail:

  /*
   * Parent-side failure before the child exists: release whichever pipe
   * ends were opened so repeated failures do not exhaust MOM's
   * descriptors.  errno is preserved across the close() calls.
   */

  j = errno;

  if (parent_read >= 0)
    close(parent_read);

  if (parent_write >= 0)
    close(parent_write);

  if (kid_read >= 0)
    close(kid_read);

  if (kid_write >= 0)
    close(kid_write);

  errno = j;

  return(-1);
  }  /* END start_process() */




/*
**	Release the ji_vnods and ji_hosts arrays of a job and reset both
**	pointers to NULL.  For every host entry the host name is freed and
**	each event still queued on the entry is unlinked and freed together
**	with its argv/envp arrays.  Streams are left open.
*/

void nodes_free(

  job *pj)

  {
  void	arrayfree  A_((char **array));
  hnodent  *host;
  eventent *ev;

  if (pj->ji_vnods != NULL)
    {
    free(pj->ji_vnods);

    pj->ji_vnods = NULL;
    }

  if (pj->ji_hosts == NULL)
    {
    /* no host array, nothing more to release */

    return;
    }

  /* the host array is terminated by an entry with hn_node == TM_ERROR_NODE */

  for (host = pj->ji_hosts;host->hn_node != TM_ERROR_NODE;host++)
    {
    if (host->hn_host != NULL)
      free(host->hn_host);

    /* don't close stream incase another job uses it */

    /* always take the current head of the list until it is empty */

    for (ev = (eventent *)GET_NEXT(host->hn_events);
         ev != NULL;
         ev = (eventent *)GET_NEXT(host->hn_events))
      {
      if (ev->ee_argv != NULL)
        arrayfree(ev->ee_argv);

      if (ev->ee_envp != NULL)
        arrayfree(ev->ee_envp);

      delete_link(&ev->ee_next);

      free(ev);
      }  /* END for (ev) */
    }    /* END for (host) */

  free(pj->ji_hosts);

  pj->ji_hosts = NULL;

  return;
  }  /* END nodes_free() */
  




/*
**	Generate the host (ji_hosts) and vnode (ji_vnods) arrays for a job
**	from the exec_host attribute.
**
**	nodes_free() is called first in case we have seen this job before.
**	exec_host is scanned once to count the '+'-separated entries so both
**	arrays can be allocated, then scanned again entry by entry.  Each
**	entry has the form "hostname" or "hostname/index"; the part before
**	the '/' names the host and the number after it becomes vn_index.
**	A host that appears in several entries gets one hnodent shared by
**	the corresponding vnodents.
**
**	If exec_host is not set, or is set but has no string value, the
**	local host name (mom_host) is used as the only entry.
**
**	Both arrays are terminated by an element whose hn_node / vn_node
**	field is TM_ERROR_NODE.  On return ji_numnodes holds the number of
**	distinct hosts and ji_numvnod the number of entries.
*/

void job_nodes(

  job *pjob)  /* I (modified) */

  {
  char         *id = "job_nodes";

  int		i, j, nhosts, nodenum;
  int		ix;
  char		*cp, *nodestr;
  hnodent	*hp;
  vnodent	*np;
  extern	char	mom_host[];

  nodes_free(pjob);

  nodenum = 1;
  nodestr = NULL;

  if (pjob->ji_wattr[(int)JOB_ATR_exec_host].at_flags &
      ATR_VFLAG_SET) 
    {
    nodestr = pjob->ji_wattr[(int)JOB_ATR_exec_host].at_val.at_str;
    }

  if (nodestr != NULL) 
    {
    /* one entry per '+' separator, plus one */

    for (cp = nodestr;*cp;cp++) 
      {
      if (*cp == '+')
        nodenum++;
      }
    } 
  else
    {
    /* attribute unset, or set without a value: never parse a NULL string */

    nodestr = mom_host;
    }

  /* one extra element in each array for the TM_ERROR_NODE terminator */

  pjob->ji_hosts = (hnodent *)calloc(nodenum + 1,sizeof(hnodent));
  pjob->ji_vnods = (vnodent *)calloc(nodenum + 1,sizeof(vnodent));

  assert(pjob->ji_hosts);
  assert(pjob->ji_vnods);

  pjob->ji_numvnod = nodenum;

  nhosts = 0;

  np = pjob->ji_vnods;

  for (i = 0;i < nodenum;i++,np++) 
    {
    char  nodename[MAXPATHLEN + 1];
    char *dp;
    char *dend = nodename + sizeof(nodename) - 1;  /* room for '\0' */
		
    ix = 0;

    for (cp = nodestr,dp = nodename;*cp;cp++) 
      {
      if (*cp == '/') 
        {
        /* host name is complete; pick up the index and skip to next entry */

        ix = atoi(cp + 1);

        while ((*cp != '\0') && (*cp != '+'))
          ++cp;

        if (*cp == '\0') 
          {
          nodestr = cp;

          break;
          }
        }

      if (*cp == '+') 
        {
        nodestr = cp + 1;

        break;
        }

      /* characters beyond the buffer size are dropped, not written */

      if (dp < dend)
        *dp++ = *cp;
      }

    *dp = '\0';

    /* see if we already have this host */

    for (j = 0;j < nhosts;++j) 
      {
      if (strcmp(nodename,pjob->ji_hosts[j].hn_host) == 0)
        break;
      }

    hp = &pjob->ji_hosts[j];

    if (j == nhosts) 
      {	
      /* first time this host is seen: fill in a new hnodent */

      hp->hn_node = nhosts++;
      hp->hn_stream = -1;
      hp->hn_sister = SISTER_OKAY;
      hp->hn_host = strdup(nodename);

      /* hn_host is handed to strcmp() above on later iterations */

      assert(hp->hn_host);

      CLEAR_HEAD(hp->hn_events);
      }

    np->vn_node  = i;	/* make up node id */
    np->vn_host  = &pjob->ji_hosts[j];
    np->vn_index = ix;

    if (LOGLEVEL >= 4)
      {
      sprintf(log_buffer,"%d: %s/%d",
        np->vn_node, 
        np->vn_host->hn_host, 
        np->vn_index);

      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        id,
        log_buffer);
      }
    }  /* END for (i) */

  /* terminate both arrays */

  np->vn_node = TM_ERROR_NODE;

  pjob->ji_hosts[nhosts].hn_node = TM_ERROR_NODE;
  pjob->ji_numnodes = nhosts;
  pjob->ji_numvnod  = nodenum;

  if (LOGLEVEL >= 2)
    {
    sprintf(log_buffer,"job: %s numnodes=%d numvnod=%d",
      pjob->ji_qs.ji_jobid,
      nhosts,
      nodenum);

    log_record(
      PBSEVENT_ERROR,
      PBS_EVENTCLASS_JOB,
      id,
      log_buffer);
    }

  return;
  }  /* END job_nodes() */


/* start_exec()
   TMomFinalizeJob1() 
   TMomFinalizeJob2() 
   TMomFinalizeJob3() */




/*
 * start_exec() - start execution of a job
 *  job newly allocated, and added to svr_alljobs *
 *  pjob->ji_qs.ji_state = JOB_STATE_RUNNING
 *  pjob->ji_qs.ji_substate = JOB_SUBSTATE_PRERUN *
 *
 * Steps performed here:
 *  - create the job cookie if the job does not have one yet
 *  - build the host/vnode arrays (job_nodes()) and look up the user
 *    (check_pwd()), optionally creating a tmpdir
 *  - more than one node: open a stream to every sister, create the two
 *    sockets later used for demux, send IM_JOIN_JOB to every sister,
 *    then return
 *  - one node: run TMomFinalizeJob1/2/3 and return once the job is
 *    started (or once the start timeout expires, in which case MOM will
 *    recheck later)
 *
 * Failures are reported through exec_bail(); nothing is returned.
 */

void start_exec(

  job *pjob) /* I (modified) */

  {
  static char	*id = "start_exec";

  eventent	*ep;
  int		i, nodenum;
  int		ports[2], socks[2];
  struct	sockaddr_in saddr;
  hnodent	*np;
  attribute	*pattr;
  tlist_head	phead;
  svrattrl	*psatl;
  int		stream;
  char		tmpdir[MAXPATHLEN];

  torque_socklen_t slen;

  void im_compose A_((int stream,
    char	*jobid,
    char	*cookie,
    int		command,
    tm_event_t	event,
    tm_task_id	taskid));

  if (!(pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags & ATR_VFLAG_SET)) 
    {
    char		*tt;
    extern time_t	loopcnt;
    MD5_CTX		c;

    /* 16 digest bytes as two hex digits each, plus the terminating '\0' */

    tt = malloc(33);

    /* same allocation policy as the other allocations in this function */

    assert(tt != NULL);

    pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str = tt;

    pjob->ji_wattr[(int)JOB_ATR_Cookie].at_flags |= ATR_VFLAG_SET;

    loopcnt++;

    /* cookie = MD5 over the loop counter and the raw job structure */

    MD5Init(&c);

    MD5Update(&c,(caddr_t)&loopcnt,sizeof(loopcnt));

    MD5Update(&c,(caddr_t)pjob,sizeof(job));

    MD5Final(&c);

    for (i = 0;i < 16;i++)
      {
      sprintf(&tt[i * 2],"%02X", 
        c.digest[i]);
      }

    DBPRT(("===== MD5 %s\n",
      tt))
    }  /* END if () */

  job_nodes(pjob);

  /* start_exec only run on mother superior */

  pjob->ji_nodeid = 0;	/* I'm MS */

  nodenum = pjob->ji_numnodes;

  /* We do this early because we need the uid/gid for TMakeTmpDir */

  if (!check_pwd(pjob)) 
    {
    log_err(-1,id,log_buffer);

    exec_bail(pjob,JOB_EXEC_FAIL1);

    return;
    }

  /* should we make a tmpdir? */

  if (TTmpDirName(pjob,tmpdir))
    {
    if (!TMakeTmpDir(pjob,tmpdir))
      {
      exec_bail(pjob,JOB_EXEC_FAIL1);

      return;
      }
    }

  /* if nodecount > 1, return once joins are sent, if nodecount == 1, return once job is started */

  if (nodenum > 1) 
    {
    pjob->ji_resources = (noderes *)calloc(nodenum - 1,sizeof(noderes));

    assert(pjob->ji_resources != NULL);

    /* encode all job attributes once; the list is sent to every sister */

    CLEAR_HEAD(phead);

    pattr = pjob->ji_wattr;

    for (i = 0;i < (int)JOB_ATR_LAST;i++) 
      {
      (job_attr_def + i)->at_encode(
        pattr + i, 
        &phead,
        (job_attr_def + i)->at_name, 
        NULL,
        ATR_ENCODE_MOM);
      }  /* END for (i) */

    attrl_fixlink(&phead);

    /*
    **  Open streams to the sisterhood.
    */

    for (i = 1;i < nodenum;i++) 
      {
      np = &pjob->ji_hosts[i];

      log_buffer[0] = '\0';

      /* rpp_open() will succeed even if MOM is down */

      np->hn_stream = rpp_open(np->hn_host,pbs_rm_port,log_buffer);

      if (np->hn_stream < 0) 
        {
        /* keep any message already in log_buffer; add one only if it is empty */

        if (log_buffer[0] == '\0')
          {
          sprintf(log_buffer,"rpp_open failed on %s",
            np->hn_host);
          }

        log_err(errno,id,log_buffer);

        pjob->ji_nodekill = i;

        /* the encoded attribute list is not needed anymore */

        free_attrlist(&phead);

        exec_bail(pjob,JOB_EXEC_FAIL1);

        return;
        }
      }    /* END for (i) */

    /*
    **	Open two sockets for use by demux program later.
    */

    for (i = 0;i < 2;i++)
      socks[i] = -1;

    for (i = 0;i < 2;i++) 
      {
      if ((socks[i] = socket(AF_INET,SOCK_STREAM,0)) == -1)
        break;

      memset(&saddr,'\0',sizeof(saddr));

      saddr.sin_addr.s_addr = INADDR_ANY;
      saddr.sin_family = AF_INET;

      /* sin_port is zero after the memset; the bound port is read back below */

      if (bind(
           socks[i], 
           (struct sockaddr *)&saddr,
           sizeof(saddr)) == -1)
        {
        break;
        }

      slen = sizeof(saddr);

      if (getsockname(socks[i],(struct sockaddr *)&saddr,&slen) == -1)
        break;

      ports[i] = (int)ntohs(saddr.sin_port);
      }  /* END for (i) */

    if (i < 2) 
      {
      /* the loop above stopped early: one of the socket calls failed */

      log_err(errno,id,"stdout/err socket");

      for (i = 0;i < 2;i++) 
        {
        if (socks[i] != -1)
          close(socks[i]);
        }

      free_attrlist(&phead);

      /* command sisters to abort job and continue */

      exec_bail(pjob,JOB_EXEC_FAIL1);

      return;
      }

    pjob->ji_stdout = socks[0];
    pjob->ji_stderr = socks[1];

    /*
    **	Send out a JOIN_JOB message to all the MOM's in the sisterhood.
    */

    /* NOTE:  does not check success of join request */

    for (i = 1;i < nodenum;i++) 
      {
      np = &pjob->ji_hosts[i];
      stream = np->hn_stream;

      ep = event_alloc(IM_JOIN_JOB,np,TM_NULL_EVENT,TM_NULL_TASK);

      /* im_compose() will succeed even if mom is down */

      im_compose(
        stream, 
        pjob->ji_qs.ji_jobid,
        pjob->ji_wattr[(int)JOB_ATR_Cookie].at_val.at_str,
        IM_JOIN_JOB, 
        ep->ee_event, 
        TM_NULL_TASK);

      diswsi(stream,i);	        /* nodeid of receiver */
      diswsi(stream,nodenum);	/* number of nodes */
      diswsi(stream,ports[0]);	/* out port number */
      diswsi(stream,ports[1]);	/* err port number */

      /* write jobattrs */

      psatl = (svrattrl *)GET_NEXT(phead);

      encode_DIS_svrattrl(stream,psatl);

      /* rpp_flush() will succeed even if MOM is down */

      if (rpp_flush(stream) != 0)
        {
        sprintf(log_buffer,"ALERT:  unable to send join_job message to %s",
          np->hn_host);

        log_err(errno,id,log_buffer);
        }
      }  /* END for (i) */

    free_attrlist(&phead);
    }  /* END if (nodenum > 1) */ 
  else 
    {
    int SC;
    int RC;
  
    int Count;

    pjobexec_t *TJE;

    /* single node job - no sisters */

    ports[0] = -1;
    ports[1] = -1;

    pjob->ji_stdout = -1;
    pjob->ji_stderr = -1;

    if (TMOMJobGetStartInfo(NULL,&TJE) == FAILURE)
      {
      sprintf(log_buffer,"ALERT:  cannot locate available job slot");

      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);

      /* on failure, TJE is NULL */

      exec_bail(pjob,JOB_EXEC_RETRY);

      return;
      }

    if (TMomFinalizeJob1(pjob,TJE,&SC) == FAILURE)
      {
      /* FAILURE (or at least do not continue) */

      if (SC != 0)
        {
        /* zero the slot before bailing out */

        memset(TJE,0,sizeof(pjobexec_t));

        exec_bail(pjob,SC);
        }

      return;
      }

    /* TMomFinalizeJob2() blocks until job is fully launched */
 
    if (TMomFinalizeJob2(TJE,&SC) == FAILURE)
      {
      if (SC != 0)
        {
        memset(TJE,0,sizeof(pjobexec_t));

        exec_bail(pjob,SC);
        }

      return;
      }

    if (TMomCheckJobChild(TJE,TJobStartBlockTime,&Count,&RC) == FAILURE)
      {
      /* not an error: TJE is left as is and MOM rechecks the job later */

      if (LOGLEVEL >= 3)
        {
        sprintf(log_buffer,"job not ready after %ld second timeout, MOM will recheck",
          TJobStartBlockTime);

        log_record(
          PBSEVENT_ERROR,
          PBS_EVENTCLASS_JOB,
          pjob->ji_qs.ji_jobid,
          log_buffer);
        }

      return;
      }

    /* NOTE:  TMomFinalizeJob3() populates SC */

    if (TMomFinalizeJob3(TJE,Count,RC,&SC) == FAILURE)
      {
      /* no need to log an error, TMomFinalizeJob3 already does it */

      memset(TJE,0,sizeof(pjobexec_t));

      exec_bail(pjob,SC);

      return;
      }

    /* SUCCESS:  MOM returns */

    if (LOGLEVEL >= 3)
      {
      sprintf(log_buffer,"job successfully started");

      log_record(
        PBSEVENT_ERROR,
        PBS_EVENTCLASS_JOB,
        pjob->ji_qs.ji_jobid,
        log_buffer);
      }

    /* clear old TJE */

    memset(TJE,0,sizeof(pjobexec_t));
    }  /* END else (nodenum > 1) */

  /* SUCCESS */

  if (LOGLEVEL >= 3)
    {
    sprintf(log_buffer,"job %s reported successful start on %d node(s)",
      pjob->ji_qs.ji_jobid,
      nodenum);

    LOG_EVENT(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      log_buffer);
    }

  return;
  }  /* END start_exec() */





/*
 * fork_me - fork mom, close all other connections and set default signal actions
 */

pid_t fork_me(

  int conn)  /* I */

  {
  struct sigaction act;
  pid_t		 pid;

  fflush(stdout);
  fflush(stderr);

  pid = fork();

  if (pid == 0) 
    {
    /* now the child */

    /* Turn off alarm if it should happen to be on */

    alarm(0);

    rpp_terminate();

    /* Reset signal actions for most to SIG_DFL */

    sigemptyset(&act.sa_mask);

    act.sa_flags   = 0;
    act.sa_handler = SIG_DFL;

    sigaction(SIGCHLD,&act,(struct sigaction *)0);

#ifdef _CRAY
    sigaction(WJSIGNAL,&act,(struct sigaction *)0);
#endif	/* _CRAY */

    sigaction(SIGHUP,&act,(struct sigaction *)0);
    sigaction(SIGINT,&act,(struct sigaction *)0);
    sigaction(SIGTERM,&act,(struct sigaction *)0);

    /* reset signal mask */

    sigprocmask(SIG_SETMASK,&act.sa_mask,NULL);

    mom_close_poll();

    /* NOTE:  close logfile, lockfile, and connection to server (NYI) */

    if (lockfds >= 0)
      {
      close(lockfds);
 
      lockfds = -1;
      }

    log_close(0);

    net_close(conn);	/* close all but for the current */

    /* release mlock; it seems to be inherited even though the
     * man page claims otherwise */

#ifdef PPINMEM
    munlockall();
#endif /* PPINMEM */
    } 
  else if (pid < 0)
    {
    log_err(errno,"fork_me","fork failed");
    }

  return(pid);
  }  /* END fork_me() */





/*
 * starter_return - return starter value, 
 *	exit if negative
 */

static void starter_return(

  int                  upfds,
  int                  downfds,
  int                  code,
  struct startjob_rtn *sjrtn)    /* I */

  {
  struct startjob_rtn ack;
  int i;

  sjrtn->sj_code = code;
  
  write(upfds,(char *)sjrtn,sizeof(*sjrtn));

  if (code < 0)
    close(upfds);

  /* wait for acknowledgement */

  do 
    {
    i = read(downfds,&ack,sizeof(ack));

    if ((i == -1) && (errno != EINTR))
      {
      break;
      }
    } while (i < 0);

  close(downfds);

  if (code < 0) 
    {
    exit(254);
    }

  return;
  }  /* END starter_return() */




	
/*
 * std_file_name - generate the fully qualified path/name for a
 *		   job standard stream
 *
 *	pjob    - job whose stream name is wanted
 *	which   - StdOut, StdErr or Chkpt (anything else is treated as Chkpt)
 *	keeping - set to 1 when the file is created directly in its final
 *		  location, 0 when it is a pty or lives in the spool directory
 *
 *	Returns a pointer to the path, or NULL if the job has no group
 *	cache entry (needed for the home directory) or the effective uid
 *	could not be switched to the job's user.  For interactive jobs the
 *	job's outpath attribute string is returned directly; in all other
 *	cases the result lives in a static buffer that is overwritten by the
 *	next call.  Every write into that buffer is bounded: an over-long
 *	component is truncated rather than overflowing the buffer.
 */

char *std_file_name(

  job		*pjob,
  enum job_file	 which,
  int		*keeping)	/* RETURN */

  {
  static char  path[MAXPATHLEN + 1];
  char   key;
  size_t len;
  char  *pd;
  char  *suffix;
  char  *jobpath = NULL;

#if NO_SPOOL_OUTPUT == 1
  struct stat myspooldir;
  static char  path_alt[MAXPATHLEN + 1];
  int   rcstat = -1;	/* stays "failed" if no uid switching API is configured */
#endif /* NO_SPOOL_OUTPUT */

  if ((pjob->ji_wattr[(int)JOB_ATR_interactive].at_flags & ATR_VFLAG_SET) &&
      (pjob->ji_wattr[(int)JOB_ATR_interactive].at_val.at_long > 0)) 
    {
    /* interactive job, name of pty is in outpath */

    *keeping = 0;

    return(pjob->ji_wattr[(int)JOB_ATR_outpath].at_val.at_str);
    }

  if (pjob->ji_grpcache == NULL)
    {
    /* ji_grpcache required for gc_homedir information */

    return(NULL);
    }

  switch (which) 
    {
    case StdOut:

      key = 'o';
      suffix = JOB_STDOUT_SUFFIX;

      if (pjob->ji_wattr[(int)JOB_ATR_outpath].at_flags & ATR_VFLAG_SET)
        {
        jobpath = pjob->ji_wattr[(int)JOB_ATR_outpath].at_val.at_str;
        }

      break;

    case StdErr:

      key    = 'e';
      suffix = JOB_STDERR_SUFFIX;
    
      if (pjob->ji_wattr[(int)JOB_ATR_errpath].at_flags & ATR_VFLAG_SET)
        {
        jobpath = pjob->ji_wattr[(int)JOB_ATR_errpath].at_val.at_str;
        }

      break;

    case Chkpt:
    default:

      key = '\001';	/* should never be found */
      suffix = JOB_CKPT_SUFFIX;

      break;
    }  /* END switch(which) */

  /* Is file to be kept?, if so use default name in Home directory */

  if ((pjob->ji_wattr[(int)JOB_ATR_keep].at_flags & ATR_VFLAG_SET) &&
      (strchr(pjob->ji_wattr[(int)JOB_ATR_keep].at_val.at_str,key))) 
    {
    /* yes, it is to be kept: build $HOME/<jobname>.<key><seq_number> */

    strncpy(path,pjob->ji_grpcache->gc_homedir,sizeof(path) - 1);

    path[sizeof(path) - 1] = '\0';
		
    /* use only the last component of the job name; when the name */
    /* contains a '/', pd keeps it and it serves as the separator  */

    pd = strrchr(pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str,'/');

    if (pd == NULL) 
      {
      pd = pjob->ji_wattr[(int)JOB_ATR_jobname].at_val.at_str;

      strncat(path,"/",sizeof(path) - strlen(path) - 1);
      }

    strncat(path,pd,sizeof(path) - strlen(path) - 1);  /* start with the job name */

    len = strlen(path);

    if (len < sizeof(path) - 1)
      path[len++] = '.';            /* the dot        */

    if (len < sizeof(path) - 1)
      path[len++] = key;            /* the letter     */

    pd = pjob->ji_qs.ji_jobid;      /* the seq_number */

    while (isdigit((unsigned char)*pd) && (len < sizeof(path) - 1))
      path[len++] = *pd++;

    path[len] = '\0';

    *keeping = 1;
    } 
  else 
    {

    /* don't bother keeping output if the user actually wants to discard it */

    if ((jobpath != NULL) && (*jobpath != '\0'))
      {
      char *ptr;

      /* skip everything up to and including the first ':' */

      if ((ptr=strchr(jobpath,':')) != NULL)
        {
        jobpath = ptr+1;
        }

      if (!strcmp(jobpath,"/dev/null"))
        {
        strcpy(path,"/dev/null");

        *keeping = 1;

        return(path);
        }
      }

    /* put into spool directory unless NO_SPOOL_OUTPUT is defined */

#if NO_SPOOL_OUTPUT == 1		

    /* force all output to user's HOME */

    strncpy(path,pjob->ji_grpcache->gc_homedir,sizeof(path) - 1);

    path[sizeof(path) - 1] = '\0';
		
    /* check for $HOME/.pbs_spool */ 
    /* if it's not a directory, just use $HOME as usual */

    strncpy(path_alt,path,sizeof(path_alt) - 1);

    path_alt[sizeof(path_alt) - 1] = '\0';

    strncat(path_alt,"/.pbs_spool/",sizeof(path_alt) - strlen(path_alt) - 1);

#if defined(HAVE_SETEUID) && defined(HAVE_SETEGID)

    /* most systems */

    /* stat the directory with the job user's effective uid */

    if (seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) == -1)
      {
      return(NULL);
      }

    rcstat = stat(path_alt,&myspooldir);
		
    seteuid(0);

#elif defined(HAVE_SETRESUID) && defined(HAVE_SETRESGID)

    /* HPUX and the like */

    if (setresuid(-1,pjob->ji_qs.ji_un.ji_momt.ji_exuid,-1) == -1)
      {
      return(NULL);
      }

    rcstat = stat(path_alt,&myspooldir);

    setresuid(-1,0,-1);
#endif  /* HAVE_SETRESUID and friends */

    if ((rcstat == 0) && (S_ISDIR(myspooldir.st_mode)))
      {
      strncpy(path,path_alt,sizeof(path) - 1);

      path[sizeof(path) - 1] = '\0';
      }
    else
      {
      strncat(path,"/",sizeof(path) - strlen(path) - 1);
      }

    *keeping = 1;

#else	/* NO_SPOOL_OUTPUT */

    strncpy(path,path_spool,sizeof(path) - 1);

    path[sizeof(path) - 1] = '\0';

    *keeping = 0;

#endif	/* NO_SPOOL_OUTPUT */

    strncat(path,pjob->ji_qs.ji_fileprefix,(sizeof(path) - strlen(path) - 1));
    strncat(path,suffix,(sizeof(path) - strlen(path) - 1));
    }  /* END else */

  return(path);
  }  /* END std_file_name() */




/*
 * open_std_file - open either standard output or standard error for the job.
 *
 *	pjob  - job the file belongs to
 *	which - which stream (passed through to std_file_name())
 *	mode  - open(2) flags; O_EXCL and O_CREAT may be cleared here
 *	exgid - effective gid to use while opening the file
 *
 *	The file is opened with the effective uid/gid of the job's user and
 *	with the job's umask attribute if that is set; afterwards the
 *	effective uid is set back to 0 and the effective gid to pbsgroup.
 *
 *	Returns the open file descriptor, or -1 on failure (no path, a
 *	suspicious existing file, failure to switch ids, or open() failure).
 */

int open_std_file(

  job		*pjob,
  enum job_file	 which,		/* which file */
  int		 mode,		/* open(2) flags */
  gid_t		 exgid)		/* gid for file */

  {
  int   fds;
  int   keeping;
  char *path;
  int   old_umask = 0;
  int   use_umask;
  struct stat statbuf;

  if ((path = std_file_name(pjob,which,&keeping)) == NULL)
    {
    return(-1);
    }

  /* these checks are a bit complicated.  If keeping, we do what the user
   * says.  Otherwise, make sure we aren't following a symlink and that
   * the user owns the file without breaking /dev/null. */

  if (keeping)
    {
    mode &= ~O_EXCL; 
    }
  else
    {
    /* NOTE(review): the file is examined here and opened further down, */
    /* so it can still be replaced in between the two calls             */

    if (lstat(path,&statbuf) == 0)
      {
      if (S_ISLNK(statbuf.st_mode))
        {
        log_err(-1,"open_std_file","std file is symlink, someone is doing something fishy");

        return(-1);
        }

      if (S_ISREG(statbuf.st_mode))
        {
        if (statbuf.st_uid != pjob->ji_qs.ji_un.ji_momt.ji_exuid)
          {
          log_err(-1,"open_std_file","std file exists with the wrong owner, someone is doing something fishy");

          return(-1);
          }
        if ((statbuf.st_gid != exgid) && (statbuf.st_gid != 0))
          {
          log_err(-1,"open_std_file","std file exists with the wrong group, someone is doing something fishy");

          return(-1);
          }
        }

      /* seems reasonably safe to append to the existing file */
      mode &= ~(O_EXCL|O_CREAT); 
      }
    }

  use_umask = ((pjob->ji_wattr[(int)JOB_ATR_umask].at_flags & ATR_VFLAG_SET) != 0);

#if defined(HAVE_SETEUID) && defined(HAVE_SETEGID)

  /* most systems */

  if (setegid(exgid) == -1)
    {
    return(-1);
    }

  if (seteuid(pjob->ji_qs.ji_un.ji_momt.ji_exuid) == -1)
    {
    /* the gid switch already happened; undo it before bailing out */

    setegid(pbsgroup);

    return(-1);
    }

  if (use_umask)
    {
    old_umask = umask(pjob->ji_wattr[(int)JOB_ATR_umask].at_val.at_long);
    }

  fds = open(path,mode,0666);

  if (use_umask)
    {
    umask(old_umask);
    }

  seteuid(0);
  setegid(pbsgroup);

#elif defined(HAVE_SETRESUID) && defined(HAVE_SETRESGID)

  /* HPUX and the like */

  if (setresgid(-1,exgid,-1) == -1)
    {
    return(-1);
    }

  if (setresuid(-1,pjob->ji_qs.ji_un.ji_momt.ji_exuid,-1) == -1)
    {
    /* the gid switch already happened; undo it before bailing out */

    setresgid(-1,pbsgroup,-1);

    return(-1);
    }

  if (use_umask)
    {
    old_umask = umask(pjob->ji_wattr[(int)JOB_ATR_umask].at_val.at_long);
    }

  fds = open(path,mode,0666);

  if (use_umask)
    {
    umask(old_umask);
    }

  setresuid(-1,0,-1);
  setresgid(-1,pbsgroup,-1);

#else	/* Neither */
    Crash and Burn - need seteuid/setegid or need setresuid/setresgid
#endif	/* HAVE_SETRESUID */

  return(fds);
  }  /* END open_std_file() */





/*
 * find_env_slot - find out whether an environment variable is already
 *	in the table.
 *
 *	Only the name part of pstr is compared: everything up to and
 *	including the '='.  If pstr holds no '=', the whole string and
 *	its terminator are compared instead.
 *
 *	Returns the index of the first matching entry, or -1 if there is
 *	none.  The table itself is not modified.
 */

static int find_env_slot(

  struct var_table *ptbl,   /* I - table to search */
  char             *pstr)   /* I - "name" or "name=value" */

  {
  char *pend = pstr;
  int   cmplen;
  int   slot;

  /* advance to the '=' (or to the end of the string) */

  while ((*pend != '\0') && (*pend != '='))
    pend++;

  cmplen = (int)(pend - pstr) + 1;	/* one extra for '=' */

  slot = 0;

  while (slot < ptbl->v_used)
    {
    if (!strncmp(ptbl->v_envp[slot],pstr,cmplen))
      {
      return(slot);
      }

    slot++;
    }  /* END while (slot) */

  return(-1);
  }  /* END find_env_slot() */




/*
 * bld_env_variables - add one entry to the array of environment variables
 *	which are passed to the job.
 *
 *	The string "name=value" (or just "name" when value is NULL, in which
 *	case name is expected to hold the complete "name=value" text) is
 *	copied into the table's string block.  If a variable of that name is
 *	already present its pointer is redirected to the new string,
 *	otherwise a new pointer slot is used.
 *
 *	Nothing is added when every pointer slot is in use or when the
 *	string block has too little room left.
 */

void bld_env_variables(

  struct var_table *vtable,  /* M - table to extend */
  char             *name,    /* I - variable name, or full "name=value" */
  char             *value)   /* I - value, may be NULL */

  {
  int   needed;
  int   slot;
  char *entry;

  if (vtable->v_used == vtable->v_ensize)
    {
    /* no room for pointer */

    return;
    }

  needed = strlen(name) + 1;		/* name plus terminator */

  if (value != NULL)
    needed += strlen(value) + 1;	/* plus 1 for "="     */

  if (needed > vtable->v_bsize)
    {
    /* no room for string */

    return;
    }

  entry = vtable->v_block;

  strcpy(entry,name);

  if (value != NULL)
    {
    strcat(entry,"=");
    strcat(entry,value);
    }

  slot = find_env_slot(vtable,entry);

  if (slot < 0)
    {
    /* not there yet, take the next free pointer */

    slot = vtable->v_used++;
    }

  *(vtable->v_envp + slot) = entry;

  /* account for the space just consumed */

  vtable->v_block += needed;
  vtable->v_bsize -= needed;

  return;
  }  /* END bld_env_variables() */

	


#ifndef __TOLDGROUP

/*
  * init_groups - build the group list via an LDAP friendly method
  *
  *	The calling process temporarily takes on the user's supplementary
  *	groups through initgroups(), reads them back with getgroups() and
  *	then restores its own group list.  All signals in allsigs are
  *	blocked while the group list is swapped.
  *
  *	Returns the value of getgroups() on the user's group list (the
  *	number of groups stored in groups), or -1 on failure.
  */

int init_groups(

  char *pwname,   /* I User's name */
  int   pwgrp,    /* I User's group from pw entry */
  int   groupsize,/* I size of the array, following argument */
  int  *groups)   /* O ptr to group array, list build there */

  {
  /* DJH Jan 2004. The original implementation looped over all groups
     looking for membership. Thats OK for /etc/groups, but thrashes LDAP
     if you're using that for groups in nsswitch.conf. Since there is an
     explicit LDAP backend to do initgroups (3) efficiently in nss_ldap
     (on Linux), lets use initgroups() to figure out the group
     membership. A little clunky, but not too ugly.  */

  char id[]="init_groups";
  extern sigset_t allsigs; /* set up at the start of mom_main */
  sigset_t oldmask;

  gid_t momgroups[NGROUPS_MAX + 1]; /* plus one for the egid below */
  gid_t egid;

  int nmom;
  int ngroups;
  int idx;
  int have_egid;

  /* remember our own group access, it is about to be overwritten */

  nmom = getgroups(NGROUPS_MAX,momgroups);

  if (nmom < 0) 
    {
    log_err(errno,id,"getgroups");

    return(-1);
    }

  /* From the Linux man page: It is unspecified whether the effective
     group ID of the calling process is included in the returned
     list. (Thus, an application should also call getegid(2) and add
     or remove the resulting value.)
  */

  egid = getegid();

  /* append the egid unless it is already in the list */

  have_egid = 0;

  for (idx = 0;idx < nmom;idx++)
    {
    if (momgroups[idx] == egid)
      {
      have_egid = 1;

      break;
      }
    }

  if (!have_egid)
    momgroups[nmom++] = egid;

  if (pwgrp == 0) 
    {
    /* Emulate the original init_groups() behaviour which treated
       gid==0 as a special case: look the gid up in the password entry */

    struct passwd *pwent = getpwnam(pwname);

    if (pwent == NULL) 
      {
      log_err(errno,id,"no such user");

      return(-1);
      }

    pwgrp = pwent->pw_gid;
    }

  /* Block signals while we do this or else the signal handler might
     run with strange group access */

  if (sigprocmask(SIG_BLOCK,&allsigs,&oldmask) == -1) 
    {
    log_err(errno,id,"sigprocmask(BLOCK)");

    return(-1);
    }

  if (initgroups(pwname,pwgrp) < 0) 
    {
    log_err(errno,id,"initgroups");

    ngroups = -1;
    } 
  else 
    {
    ngroups = getgroups(groupsize,(gid_t *)groups);
    }

  /* restore state: first our own groups, then the signal mask */

  if (setgroups(nmom,momgroups) < 0)
    log_err(errno,id,"setgroups");

  if (sigprocmask(SIG_SETMASK,&oldmask,NULL) == -1)
    log_err(errno,id,"sigprocmask(SIG_SETMASK)");

  return(ngroups);
  }  /* END init_groups() */

#else /* !__TOLDGROUP */

/*
 * init_groups - read the /etc/group file and build an array of
 *      group memberships for user pwname.
 *
 *	The user's primary group pwgrp is stored first unless it is 0.
 *	Every other group that lists pwname as a member is appended once.
 *
 *	Returns the number of gids stored in groups, or -1 if the array
 *	(groupsize entries) is too small to hold them all.
 */

int init_groups(

  char *pwname,	   /* I User's name */
  int   pwgrp,	   /* I User's group from pw entry */
  int   groupsize, /* I size of the array, following argument */
  int  *groups)	   /* O ptr to group array, list build there */

  {
  struct group *grp;
  int i;
  int n;

  n = 0;

  if (pwgrp != 0)
    {
    if (groupsize < 1)
      {
      /* not even room for the primary group */

      return(-1);
      }

    *(groups + n++) = pwgrp;
    }

  setgrent();

  while ((grp = getgrent()) != NULL) 
    {
    /* the primary group has already been handled above */

    if (grp->gr_gid == (gid_t)pwgrp)
      continue;
		
    for (i = 0;grp->gr_mem[i] != NULL;i++)
      {
      if (strcmp(grp->gr_mem[i],pwname) != 0)
        continue;

      if (n >= groupsize) 
        {
        endgrent();

        return(-1);
        }

      *(groups + n++) = grp->gr_gid;

      /* one entry per group is enough, even if the user is listed twice */

      break;
      }
    }    /* END while (grp) */

  endgrent();

  return(n);
  }  /* END init_groups() */

#endif /* !__TOLDGROUP */



/*
 * catchinter - catch death of writer child and/or shell child of interactive
 *	When one dies, kill off the other; there is no mercy in this family.
 *
 *	If waitpid() reports that no child has changed state (returns 0)
 *	nothing is done.  If it returns writerpid the shell is killed; for
 *	any other result the writer is killed.  The handler then waits for
 *	a child and clears mom_reader_go.
 */	

static void catchinter(

  int sig)  /* I (not used) */

  {
  int   status;
  pid_t deadpid;
  pid_t victim;

  deadpid = waitpid(-1,&status,WNOHANG);

  if (deadpid != 0)
    {
    /* take down whichever of the pair is still around */

    victim = (deadpid == writerpid) ? shellpid : writerpid;

    kill(victim,SIGKILL);

    wait(&status);

    mom_reader_go = 0;
    }

  return;
  }  /* END catchinter() */





/*
 * Look for a certain environment variable which has a port# which should
 * be opened on the MS to establish communication for one of the 3 stdio
 * streams.  >=0 return is that valid fd, -1 means no env var found,
 * -2 means malformed env value or failure to connect.
 *
 * The variable is looked up in the file-level vtable.  Its value must be
 * a decimal number in the range 1-65535 with nothing after it; anything
 * else (including an empty value) is reported as malformed.  An entry
 * consisting of the bare name without '=' counts as "not found".
 */

static int search_env_and_open(

  const char *envname,  /* I - name of the variable to look for */
  u_long      ipaddr)   /* I - address handed to open_demux() */

  {
  static char *id = "search_env_and_open";
  int i, len = strlen(envname);

  for (i = 0;i < vtable.v_used;i++)
    {
    const char *cp;
    char       *cq;
    long        val;
    int         fd, port;

    if (strncmp(vtable.v_envp[i],envname,len) != 0)
      continue;

    cp = vtable.v_envp[i] + len;

    if (*cp == '\0')
      break;  /* empty, ignore it */

    if (*cp != '=')
      {
      /* envname is only the prefix of a longer variable name */

      continue;
      }

    cp++;

    errno = 0;

    val = strtol(cp,&cq,10);

    if ((cq == cp) || 
        (*cq != '\0') || 
        (errno == ERANGE) || 
        (val <= 0) || 
        (val > 65535))
      {
      /* no digits, trailing junk, or not a usable port number */

      sprintf(log_buffer,"improper value for %s", envname);
      log_err(-1,id,log_buffer);

      return(-2);
      }

    port = (int)val;

#if 0       /* debugging */
    log_err(0, "search_env_and_open attempting open", vtable.v_envp[i]);
#endif

    if ((fd = open_demux(ipaddr,port)) < 0) 
      {
      sprintf(log_buffer,"failed connect to stdio on %s:%d",vtable.v_envp[i],port);
      log_err(errno,id,log_buffer);

      return(-2);
      }

    return(fd);
    }    /* END for (i) */

  /* not found */

  return(-1);  
  }  /* END search_env_and_open() */




/*
 * TMomCheckJobChild - wait up to Timeout seconds for data on the job
 *	start pipe (TJE->jsmpipe[0]) and read one startjob_rtn record
 *	from it into TJE->sjr.
 *
 *	Returns FAILURE if select() reports no readable data within the
 *	timeout (or fails); nothing is read and Count/RC are left untouched.
 *	Returns SUCCESS once a read has been attempted: *Count then holds
 *	the result of read() (bytes read, 0 at end of file, -1 on error)
 *	and *RC holds the errno of a failed read, or 0 if read() did not
 *	fail.
 */

int TMomCheckJobChild(

  pjobexec_t *TJE,      /* I */
  int         Timeout,  /* I (in seconds) */
  int        *Count,    /* O (bytes read) */
  int        *RC)       /* O (return code/errno) */

  {
  int i;
  int read_errno = 0;

  fd_set fdset;

  int rc;

  struct timeval timeout;

  /* NOTE:  assume if anything is on pipe, everything is on pipe 
            (may result in hang) */

  /* block up to timeout, wait for child to complete indicating 
     success/failure of job launch */

  /* read returns the session id or error */

  timeout.tv_sec  = Timeout;
  timeout.tv_usec = 0;

  FD_ZERO(&fdset);

  FD_SET(TJE->jsmpipe[0],&fdset);

  rc = select(
         TJE->jsmpipe[0] + 1,
         &fdset,
         (fd_set *)NULL,
         (fd_set *)NULL,
         &timeout);

  if (rc <= 0)
    {
    /* data not yet available */

    return(FAILURE);
    }

  for (;;)
    {
    i = read(
          TJE->jsmpipe[0],
          (char *)&TJE->sjr,
          sizeof(struct startjob_rtn));

    if (i == -1)
      {
      if (errno == EINTR)
        continue;

      /* capture errno right away, before anything can overwrite it */

      read_errno = errno;
      }

    break;
    }

  /* report an errno only if the read itself failed; a leftover errno */
  /* from some earlier call says nothing about this read              */

  *RC = read_errno;
  *Count = i;

  return(SUCCESS);
  }  /* END TMomCheckJobChild() */





#ifdef PENABLE_DYNAMIC_CPUSETS2

/**********************************************************************/
/*                                                                    */
/* Setup call to dynamic cpuset allocator (0 = SUCCESS, <0 = FAILURE) */
/*                                                                    */
/**********************************************************************/

/*
 * execute_dynamo - run the allocator on every host of the job
 *
 *	The host file <cpath>/aux/<cjobid> is read line by line; each
 *	line is a host name and consecutive identical lines are counted
 *	as CPUs on that host.  Blank lines are ignored.  If the allocator
 *	binary exists, a queue name is appended to cQueue (the part of
 *	cjobid before the first '.', left-padded with '0' to at least 3
 *	characters) and the allocator is started through K5RSH once per
 *	host with that host's CPU count.
 *
 *	Returns 0 on success (also when the allocator is not installed),
 *	-1 if the host file cannot be read, is empty, names more than
 *	MAXIMAGES hosts or has an over-long host name, or if a command
 *	line would not fit its buffer.  The allocator's exit status is
 *	not examined.
 *
 *	NOTE(review): host names, cjobid and cQueue end up in a shell
 *	command passed to system(); they must not come from untrusted input.
 */

int execute_dynamo( 

  int   iwhich,   /* I 0 creates cpuset; non-zero to delete */
  char *cjobid,   /* I the PBS JOBID */
  int   iuid,     /* I the UID for the cpuset */
  int   igid,     /* I the GID for the cpuset (passed as -g when creating) */
  char *cpath,    /* I the PATH for PBS_HOME to get to node file */
  char *cQueue)   /* O queue created/deleted (appended to; caller initializes) */

  {
  int             irc, iimages[MAXIMAGES], ii;
  int             nhosts = 0;   /* number of entries used in cMyName/iimages */
  int             rc = 0;
  struct stat     stbuf;
  char            cMyName[MAXIMAGES][256]; 
  char            cBuf[(2 * MAXPATHLEN) + 4];
  char           *cp;
  char           *cmd = "/usr/local/sbin/pbs_dynamo";

  FILE           *fp;

  /*  clear the number of CPUs to reserve on each host */

  for (ii = 0;ii < MAXIMAGES;ii++)
    {
    iimages[ii] = 0;
    }

  /* parse hostfile */

  if (strlen(cpath) + strlen("/aux/") + strlen(cjobid) >= sizeof(cBuf))
    {
    return(-1);
    }

  strcpy(cBuf,cpath);
  strcat(cBuf,"/aux/");
  strcat(cBuf,cjobid);

  fp = fopen(cBuf,"r");

  if (fp == NULL)
    {
    return(-1);
    }

  while (fgets(cBuf,sizeof(cBuf),fp) != NULL) 
    {
    /* the last line may lack a newline */

    cp = strchr(cBuf,'\n');

    if (cp != NULL)
      *cp = '\0';

    if (cBuf[0] == '\0')
      continue;

    /* Count CPUs assigned. */

    if ((nhosts > 0) && (strcmp(cBuf,cMyName[nhosts - 1]) == 0))
      {
      iimages[nhosts - 1]++;

      continue;
      }

    /* Switch hosts and go to new CPU counter */

    if ((nhosts >= MAXIMAGES) || (strlen(cBuf) >= sizeof(cMyName[0])))
      {
      fclose(fp);

      return(-1);
      }

    strcpy(cMyName[nhosts],cBuf);

    iimages[nhosts] = 1;

    nhosts++;
    }

  fclose(fp);

  if (nhosts == 0)
    {
    /* no hosts listed, nothing to allocate on */

    return(-1);
    }

  /* Make sure that the allocator has been installed */

  irc = stat(cmd,&stbuf);

  if (!irc) 
    {
    /* Create queue name, truncated jobid; but, must be at */
    /*   least 3 characters long */

    cp = strchr(cjobid,'.');

    irc = (cp != NULL) ? (int)(cp - cjobid) : (int)strlen(cjobid);

    for (ii = irc;ii < 3;ii++)
      strcat(cQueue,"0");

    strncat(cQueue,cjobid,irc);

    for (ii = 0;ii < nhosts;ii++) 
      {
      /* Construct the allocator command; 96 covers the fixed text */
      /* of the format plus three decimal integers                  */

      if (strlen(K5RSH) + strlen(cMyName[ii]) + strlen(cmd) + 
          strlen(cQueue) + 96 >= sizeof(cBuf))
        {
        rc = -1;

        continue;
        }

      if (iwhich) 
        {  
        /* iwhich=1 destroys cpuset */

        sprintf(cBuf,"%s %s %s -d %s -p %d >/dev/null 2>&1",
          K5RSH,         /* K5 rsh     */
          cMyName[ii],   /* each host  */
          cmd,           /* allocator  */
          cQueue,        /* for queue  */
          iimages[ii]);  /* for NCPUs  */
        } 
      else 
        {        
        /* iwhich=0 creates cpuset */

        sprintf(cBuf,"%s %s %s -u %d -g %d -c %s -p %d >/dev/null 2>&1",
          K5RSH,         /* K5 rsh     */
          cMyName[ii],   /* each host  */
          cmd,           /* allocator  */
          iuid,          /* for UID    */
          igid,          /* for GID    */
          cQueue,        /* for queue  */
          iimages[ii]);  /* for NCPUs  */
        }

      /* call the allocator */

      irc = system(cBuf);
      }  /* END for (ii) */
    }    /* END if (!irc) */
  
  return(rc);
  }  /* END execute_dynamo() */

#endif  /* PENABLE_DYNAMIC_CPUSETS2 */



/* END start_exec.c */
