#include "license_pbs.h" /* See here for the software license */ #include /* the master config generated by configure */ #include "lib_mom.h" /* header */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* needed for oom_adj */ #include #ifdef Q_6_5_QUOTAON /* remap dqblk for SUSE 9.0 */ #define dqblk if_dqblk #endif /* Q_6_5_QUOTAON */ /* #ifndef dqblk #include #define dqblk v1_disk_dqblk #endif */ #include "pbs_error.h" #include "portability.h" #include "list_link.h" #include "server_limits.h" #include "attribute.h" #include "resource.h" #include "pbs_job.h" #include "log.h" #include "mom_mach.h" #include "mom_func.h" #include "resmon.h" #include "utils.h" #include "../rm_dep.h" #include "pbs_nodes.h" #ifdef PENABLE_LINUX26_CPUSETS #include "pbs_cpuset.h" #endif /* ** System dependent code to gather information for the resource ** monitor for a Linux i386 machine. 
** ** Resources known by this code: ** cput cpu time for a pid or session ** mem memory size for a pid or session in KB ** resi resident memory size for a pid or session in KB ** sessions list of sessions in the system ** pids list of pids in a session ** nsessions number of sessions in the system ** nusers number of users in the system ** totmem total memory size in KB ** availmem available memory size in KB ** ncpus number of cpus ** physmem physical memory size in KB ** size size of a file or filesystem ** idletime seconds of idle time ** walltime wall clock time for a pid ** loadave current load average ** quota quota information (sizes in kb) ** netload number of bytes transferred for all interfaces */ #ifndef MAX_LINE #define MAX_LINE 1024 #endif #ifndef TRUE #define FALSE 0 #define TRUE 1 #endif /* TRUE */ static char procfs[] = "/proc"; static DIR *pdir = NULL; static int pagesize; extern char *ret_string; extern char extra_parm[]; extern char no_parm[]; extern char mom_host[]; extern time_t time_now; extern int LOGLEVEL; extern char PBSNodeMsgBuf[MAXLINE]; #define TBL_INC 200 /* initial proc table */ #define PMEMBUF_SIZE 2048 static proc_stat_t *proc_array = NULL; static int nproc = 0; static int max_proc = 0; /* ** external functions and data */ extern tlist_head svr_alljobs; extern struct config *search(struct config *,char *); extern struct rm_attribute *momgetattr(char *); extern int rm_errno; extern double cputfactor; extern double wallfactor; extern long system_ncpus; extern int ignwalltime; extern int igncput; extern int ignvmem; extern int ignmem; extern int job_oom_score_adjust; extern int mom_oom_immunize; #ifdef PENABLE_LINUX26_CPUSETS extern int memory_pressure_threshold; extern short memory_pressure_duration; #endif #ifdef NUMA_SUPPORT extern int num_node_boards; extern nodeboard node_boards[]; extern int numa_index; #else extern char path_meminfo[MAX_LINE]; #endif /* NUMA_SUPPORT */ /* ** local functions and data */ static const char *resi 
(struct rm_attribute *); static const char *totmem (struct rm_attribute *); static const char *availmem (struct rm_attribute *); static const char *physmem (struct rm_attribute *); static const char *ncpus (struct rm_attribute *); static const char *walltime (struct rm_attribute *); static const char *quota (struct rm_attribute *); static const char *netload (struct rm_attribute *); #ifdef NUMA_SUPPORT const char *cpuact (struct rm_attribute *); #endif #ifdef USELIBMEMACCT #ifdef __cplusplus extern "C" { #endif long long get_memacct_resi(pid_t pid); extern long get_weighted_memory_size(pid_t); #ifdef __cplusplus } #endif #endif #ifndef mbool_t #define mbool_t char #endif /* mbool_t */ mbool_t ProcIsChild(char *,pid_t,char *); extern const char *loadave(struct rm_attribute *); extern const char *nullproc(struct rm_attribute *); time_t wait_time = 10; #ifdef NUMA_SUPPORT typedef struct proc_cpu { unsigned long long idle_total; unsigned long long busy_total; } proc_cpu_t; static proc_cpu_t *cpu_array = NULL; #endif /* ** local resource array */ struct config dependent_config[] = { { "resi", {resi} }, { "totmem", {totmem} }, { "availmem", {availmem} }, { "physmem", {physmem} }, { "ncpus", {ncpus} }, #ifdef NUMA_SUPPORT { "loadave", {cpuact} }, #else { "loadave", {loadave} }, #endif { "walltime", {walltime} }, { "quota", {quota} }, { "netload", {netload} }, { "size", {size} }, { NULL, {nullproc} } }; unsigned linux_time = 0; /* * support routine for getting system time -- sets linux_time */ void proc_get_btime(void) { FILE *fp; char label[256]; if ((fp = fopen("/proc/stat", "r")) == NULL) { return; } while (!feof(fp)) { if (fscanf(fp, "%s", label) != 1) { fclose(fp); return; } if (strcmp(label, "btime")) { if (fscanf(fp, "%*[^\n]%*c") != 0) { fclose(fp); return; } } else { if (fscanf(fp, "%u", &linux_time) != 1) {} fclose(fp); return; } } /* END while (!feof(fp)) */ fclose(fp); return; } /* END proc_get_btime() */ /* NOTE: see 'man 5 proc' for /proc/pid/stat format and 
description */ /* NOTE: leading '*' indicates that field should be ignored */ /* FORMAT: [] [] [] [] [] [] [] [] [<0>] [] [] [] ... */ static char stat_str[] = " %c %d %d %d %*d %*d %u %*u \ %*u %*u %*u %lu %lu %lu %lu %*ld %*ld %*u %*ld %lu %llu %lld %*lu %*lu \ %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu"; /* * Convert jiffies to seconds. * * Hertz is sysconf(_SC_CLK_TCK) in get_proc_stat() */ #define JTOS(x) (x) / Hertz; /* * Linux /proc status routine. * * Returns a pointer to a static proc_stat_t structure given * a process number, or NULL if there is an error. Takes the * place of the ioctl call PIOCSTATUS in the irix imp of mom_mach.c * */ proc_stat_t *get_proc_stat( int pid) /* I */ { static proc_stat_t ps; static char path[MAXLINE]; static char readbuf[MAXLINE << 2]; static char *lastbracket; FILE *fd; unsigned long jstarttime; /* number of jiffies since OS start time when process started */ struct stat sb; static int Hertz = 0; int Hertz_errored = 0; if (Hertz <= 0) { Hertz = sysconf(_SC_CLK_TCK); /* returns 0 on error */ if (Hertz <= 0) { /* FAILURE */ if (!Hertz_errored) log_err(errno, "get_proc_stat", "sysconf(_SC_CLK_TCK) failed, unable to monitor processes"); Hertz_errored = 1; return(NULL); } } Hertz_errored = 0; sprintf(path, "/proc/%d/stat", pid); if ((fd = fopen(path, "r")) == NULL) { /* FAILURE */ return(NULL); } /* use 'man 5 proc' for /proc/pid/stat format */ if (!fgets(readbuf, sizeof(readbuf), fd)) { fclose(fd); return(NULL); } lastbracket = strrchr(readbuf, ')'); if (lastbracket == NULL) { fclose(fd); return(NULL); } *lastbracket = '\0'; /* We basically split the string here, overwriting the ')'. 
*/ lastbracket++; if (sscanf(readbuf,"%d (%[^\n]",&ps.pid,path) != 2) { /* FAILURE */ fclose(fd); return(NULL); } /* see stat_str[] value for mapping 'stat' format */ if (sscanf(lastbracket,stat_str, &ps.state, /* state (one of RSDZTW) */ &ps.ppid, /* ppid */ &ps.pgrp, /* pgrp */ &ps.session, /* session id */ &ps.flags, /* flags - kernel flags of the process, see the PF_* in */ &ps.utime, /* utime - jiffies that this process has been scheduled in user mode */ &ps.stime, /* stime - jiffies that this process has been scheduled in kernel mode */ &ps.cutime, /* cutime - jiffies that this process’s waited-for children have been scheduled in user mode */ &ps.cstime, /* cstime - jiffies that this process’s waited-for children have been scheduled in kernel mode */ &jstarttime, /* starttime */ &ps.vsize, /* vsize */ &ps.rss) != 12) /* rss */ { /* FAILURE */ fclose(fd); return(NULL); } if (fstat(fileno(fd), &sb) == -1) { /* FAILURE */ fclose(fd); return(NULL); } ps.uid = sb.st_uid; ps.start_time = linux_time + JTOS(jstarttime); ps.name = path; ps.utime = JTOS(ps.utime); ps.stime = JTOS(ps.stime); ps.cutime = JTOS(ps.cutime); ps.cstime = JTOS(ps.cstime); /* SUCCESS */ fclose(fd); return(&ps); } /* END get_proc_stat() */ #ifdef USELIBMEMACCT /* * Retrieve weighted RSS value for process with pid from memacctd. * Returns the value in bytes on success, returns -1 on failure. 
*/ long long get_memacct_resi(pid_t pid) { long long w_rss; if ((w_rss = get_weighted_memory_size(pid)) == -1) { sprintf(log_buffer, "get_weighted_memory_size(%d) failed", pid); log_err(errno, __func__, log_buffer); } return(w_rss); } /* END get_memacct_resi() */ #endif /* * get_proc_mem_from_path() * @returns a pointer to a struct containing the memory information * @pre-cond: path must point to a valid path of a meminfo system file */ proc_mem_t *get_proc_mem_from_path( const char *path) { proc_mem_t *mm; FILE *fp; char str[32]; long long bfsz = -1; long long casz = -1; long long fcasz = -1; if ((fp = fopen(path,"r")) == NULL) { return(NULL); } mm = (proc_mem_t *)calloc(1, sizeof(proc_mem_t)); if (fscanf(fp,"%30s",str) != 1) { fclose(fp); return(NULL); } if (!strncmp(str,"total:",sizeof(str))) { /* old format */ if (fscanf(fp,"%*[^\n]%*c") != 0) /* remove text header */ { fclose(fp); return(NULL); } /* umu vmem patch */ if (fscanf(fp, "%*s %llu %llu %llu %*u %lld %lld", &mm->mem_total, &mm->mem_used, &mm->mem_free, &bfsz, &casz) != 5) { fclose(fp); return(NULL); } mm->mem_free += casz + bfsz; if (fscanf(fp, "%*s %llu %llu %llu %*[^\n]%*c", &mm->swap_total, &mm->swap_used, &mm->swap_free) != 3) { fclose(fp); return(NULL); } } else { do { /* new format (kernel > 2.4) the first 'str' has been read */ if (!strncmp(str, "MemTotal:", sizeof(str))) { if (fscanf(fp, "%llu", &mm->mem_total) != 1) { fclose(fp); return(NULL); } mm->mem_total *= 1024; /* the unit is kB */ } else if (!strncmp(str, "MemFree:", sizeof(str))) { if (fscanf(fp, "%llu", &mm->mem_free) != 1) { fclose(fp); return(NULL); } mm->mem_free *= 1024; } else if (!strncmp(str, "Buffers:", sizeof(str))) { if (fscanf(fp, "%lld", &bfsz) != 1) { fclose(fp); return(NULL); } bfsz *= 1024; } else if (!strncmp(str, "Cached:", sizeof(str))) { if (fscanf(fp, "%lld", &casz) != 1) { fclose(fp); return(NULL); } casz *= 1024; } else if (!strncmp(str, "FilePages:", sizeof(str))) { if (fscanf(fp, "%lld", &fcasz) != 1) { 
fclose(fp); return(NULL); } fcasz *= 1024; } else if (!strncmp(str, "SwapTotal:", sizeof(str))) { if (fscanf(fp, "%llu", &mm->swap_total) != 1) { fclose(fp); return(NULL); } mm->swap_total *= 1024; } else if (!strncmp(str, "SwapFree:", sizeof(str))) { if (fscanf(fp, "%llu", &mm->swap_free) != 1) { fclose(fp); return(NULL); } mm->swap_free *= 1024; } } while (fscanf(fp, "%30s", str) == 1); } /* END else */ fclose(fp); if (bfsz >= 0 || casz >= 0) { if (bfsz > 0) mm->mem_free += bfsz; if (casz > 0) mm->mem_free += casz; } else if (fcasz > 0) { mm->mem_free += fcasz; } return(mm); } /* END get_proc_mem_from_path() */ proc_mem_t *get_proc_mem(void) { static proc_mem_t ret_mm; #ifdef NUMA_SUPPORT int i; #else proc_mem_t *mem; #endif #ifdef NUMA_SUPPORT ret_mm.mem_total = 0; ret_mm.mem_used = 0; ret_mm.mem_free = 0; ret_mm.swap_total = 0; ret_mm.swap_used = 0; ret_mm.swap_free = 0; for (i = 0; i < node_boards[numa_index].num_nodes; i++) { proc_mem_t *node_mem = get_proc_mem_from_path(node_boards[numa_index].path_meminfo[i]); if (node_mem == NULL) return(NULL); ret_mm.mem_total += node_mem->mem_total; ret_mm.mem_used += node_mem->mem_used; ret_mm.mem_free += node_mem->mem_free; ret_mm.swap_total += node_mem->swap_total; ret_mm.swap_used += node_mem->swap_used; ret_mm.swap_free += node_mem->swap_free; free(node_mem); } #else mem = get_proc_mem_from_path(path_meminfo); if(mem == NULL) return (NULL); ret_mm.mem_total = mem->mem_total; ret_mm.mem_used = mem->mem_used; ret_mm.mem_free = mem->mem_free; ret_mm.swap_total = mem->swap_total; ret_mm.swap_used = mem->swap_used; ret_mm.swap_free = mem->swap_free; free(mem); #endif return(&ret_mm); } /* END get_proc_mem() */ #ifdef PNOT proc_mem_t *get_proc_mem(void) { static proc_mem_t mm; FILE *fp; unsigned long m_tot, m_use, m_free; unsigned long s_tot, s_use, s_free; if ((fp = fopen(path_meminfo, "r")) == NULL) { return(NULL); } fscanf(fp, "%*[^\n]%*c"); /* remove text header */; fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c", &m_tot, 
&m_use, &m_free); fscanf(fp, "%*s %lu %lu %lu %*[^\n]%*c", &s_tot, &s_use, &s_free); mm.total = m_tot + s_tot; mm.used = m_use + s_use; mm.free = m_free + s_free; fclose(fp); return(&mm); } /* END get_proc_mem() */ #endif /* PNOT */ /* * sets oom_adj score for current process * requires root privileges or CAP_SYS_RESOURCE to succeed */ static int oom_adj(int score) { pid_t pid; int rc,fd; char oom_adj_path[PATH_MAX] = ""; char adj_value[128] = ""; /* valid values are -17 to 15 */ if ( score > 15 || score < -17 ) return -1; pid = getpid(); if ( snprintf(oom_adj_path, sizeof(oom_adj_path), "/proc/%d/oom_adj", pid) < 0 ) return -1; if ( ( fd = open(oom_adj_path, O_RDWR) ) == -1 ) return -1; if (snprintf(adj_value,sizeof(adj_value),"%d",score) < 0) return -1; rc = write(fd,adj_value,strlen(adj_value)); close(fd); return rc; } void dep_initialize(void) { pagesize = getpagesize(); if ((pdir = opendir(procfs)) == NULL) { log_err(errno, __func__, "opendir"); return; } /* NOTE: /proc//oom_adj tunable is linux specific */ /* LKF: make pbs_mom processes immune to oom killer's killing frenzy if requested*/ if (mom_oom_immunize != 0) { if (oom_adj(-17) < 0) { log_record( PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, "failed to make pbs_mom oom-killer immune"); } else { log_record( PBSEVENT_SYSTEM, PBS_EVENTCLASS_SERVER, __func__, "mom is now oom-killer safe"); } } proc_get_btime(); return; } /* END dep_initialize() */ void dep_cleanup(void) { log_record(PBSEVENT_SYSTEM, 0, __func__, "dependent cleanup"); if (pdir) { closedir(pdir); pdir = NULL; } return; } /* * This routine is called on each cycle of the main loop. */ void dep_main_loop_cycle(void) { /* No periodic functions. */ } /* * Internal size decoding routine. * * Accepts a resource pointer and a pointer to the unsigned long integer * to receive the decoded value. It returns a PBS error code, and the * decoded value in the unsigned long integer. 
* * sizeof(word) = sizeof(int) */ static int mm_getsize( resource *pres, /* I */ unsigned long *ret) /* O */ { unsigned long value; if (pres->rs_value.at_type != ATR_TYPE_SIZE) { return(PBSE_ATTRTYPE); } value = pres->rs_value.at_val.at_size.atsv_num; if (pres->rs_value.at_val.at_size.atsv_units == ATR_SV_WORDSZ) { if (value > ULONG_MAX / sizeof(int)) { return(PBSE_BADATVAL); } value *= sizeof(int); } if (value > (ULONG_MAX >> pres->rs_value.at_val.at_size.atsv_shift)) { return(PBSE_BADATVAL); } *ret = (value << pres->rs_value.at_val.at_size.atsv_shift); return(PBSE_NONE); } /* END mm_getsize() */ /* * Internal time decoding routine. * * Accepts a resource pointer and a pointer to the unsigned long integer * to receive the decoded value. It returns a PBS error code, and the * decoded value of time in seconds in the unsigned long integer. */ static int mm_gettime( resource *pres, unsigned long *ret) { if (pres->rs_value.at_type != ATR_TYPE_LONG) { return(PBSE_ATTRTYPE); } if (pres->rs_value.at_val.at_long < 0) { return(PBSE_BADATVAL); } *ret = pres->rs_value.at_val.at_long; return(PBSE_NONE); } static int injob( job *pjob, pid_t sid) { task *ptask; pid_t pid; #ifdef PENABLE_LINUX26_CPUSETS struct pidl *pids = NULL; struct pidl *pp; #else proc_stat_t *ps; #endif /* PENABLE_LINUX26_CPUSETS */ for (ptask = (task *)GET_NEXT(pjob->ji_tasks); ptask != NULL; ptask = (task *)GET_NEXT(ptask->ti_jobtask)) { if (ptask->ti_qs.ti_sid <= 1) continue; if (ptask->ti_qs.ti_sid == sid) { return(TRUE); } } /* processes with a different sessionid are not necessarily not part of the job: the job can call setsid; need to check whether one of the parent processes has a sessionid that is in the job */ #ifdef PENABLE_LINUX26_CPUSETS /* check whether the sid is in the job's cpuset */ pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids); pp = pids; while (pp != NULL) { pid = pp->pid; pp = pp->next; if (pid == sid) { free_pidlist(pids); return(TRUE); } } free_pidlist(pids); #else /* get the 
parent process id of the sid and check whether it is part of the job; iterate */ pid = sid; while (pid > 1) { if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat", pid); log_err(errno, __func__, log_buffer); } return(FALSE); } pid = getsid(ps->ppid); for (ptask = (task *)GET_NEXT(pjob->ji_tasks); ptask != NULL; ptask = (task *)GET_NEXT(ptask->ti_jobtask)) { if (ptask->ti_qs.ti_sid <= 1) continue; if (ptask->ti_qs.ti_sid == pid) { return(TRUE); } } } #endif /* PENABLE_LINUX26_CPUSETS */ return(FALSE); } /* END injob() */ /* * Internal session CPU time decoding routine. * * Accepts a job pointer. Returns the sum of all cpu time * consumed for all tasks executed by the job, in seconds, * adjusted by cputfactor. */ static unsigned long cput_sum( job *pjob) /* I */ { ulong cputime; int nps = 0; int i; proc_stat_t *ps; cputime = 0; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - jobid = %s", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if ((LOGLEVEL >= 6) && (ps == NULL)) { sprintf(log_buffer, "proc_array loop end - nproc=%d, i=%d, ps is null", nproc, i); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } if (!injob(pjob, ps->session)) continue; nps++; cputime += (ps->utime + ps->stime + ps->cutime + ps->cstime); if (LOGLEVEL >= 6) { sprintf(log_buffer, "%s: session=%d pid=%d cputime=%lu (cputfactor=%f)", __func__, ps->session, ps->pid, cputime, cputfactor); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } } /* END for (i) */ if (nps == 0) pjob->ji_flags |= MOM_NO_PROC; else pjob->ji_flags &= ~MOM_NO_PROC; return((unsigned long)((double)cputime * cputfactor)); } /* END cput_sum() */ /* * Return TRUE if any process in the job is over limit for cputime usage. 
*/ static int overcpu_proc( job *pjob, unsigned long limit) /* I */ { ulong cputime; pid_t pid; proc_stat_t *ps; #ifdef PENABLE_LINUX26_CPUSETS struct pidl *pids = NULL; struct pidl *pp; #else struct dirent *dent; #endif /* PENABLE_LINUX26_CPUSETS */ #ifdef PENABLE_LINUX26_CPUSETS /* Instead of collect stats of all processes running on a large SMP system, * collect stats of processes running in and below the cpuset of the job, only. */ pids = get_cpuset_pidlist(pjob->ji_qs.ji_jobid, pids); pp = pids; while (pp != NULL) { pid = pp->pid; pp = pp->next; #else rewinddir(pdir); while ((dent = readdir(pdir)) != NULL) { if (!isdigit(dent->d_name[0])) continue; pid = atoi(dent->d_name); #endif /* PENABLE_LINUX26_CPUSETS */ if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat", pid); log_err(errno, __func__, log_buffer); } continue; } #ifndef PENABLE_LINUX26_CPUSETS /* if it was in the cpuset, its part of the job, no need to check */ if (!injob(pjob, ps->session)) continue; #endif /* PENABLE_LINUX26_CPUSETS */ /* change from ps->cutime to ps->utime, and ps->cstime to ps->stime */ cputime = (ulong)((double)(ps->utime + ps->stime) * cputfactor); if (cputime > limit) { #ifdef PENABLE_LINUX26_CPUSETS free_pidlist(pids); #endif return(TRUE); } } #ifdef PENABLE_LINUX26_CPUSETS free_pidlist(pids); #endif return(FALSE); } /* END overcpu_proc() */ /* * Internal session virtual memory usage function. * * Returns the total number of bytes of address * space consumed by all current processes within the job. 
*/ static unsigned long long mem_sum( job *pjob) { int i; unsigned long long segadd; proc_stat_t *ps; segadd = 0; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - jobid = %s", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (!injob(pjob, ps->session)) continue; segadd += ps->vsize; if (LOGLEVEL >= 6) { sprintf(log_buffer, "%s: session=%d pid=%d vsize=%llu sum=%llu", __func__, ps->session, ps->pid, ps->vsize, segadd); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } } /* END for (i) */ return(segadd); } /* END mem_sum() */ /* * Internal session memory usage function. * * Returns the total number of bytes of resident memory * consumed by all current processes within the job. */ static unsigned long long resi_sum( job *pjob) { int i; unsigned long long resisize; proc_stat_t *ps; #ifdef USELIBMEMACCT long long w_rss; #endif resisize = 0; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - jobid = %s", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (!injob(pjob, ps->session)) continue; #ifdef USELIBMEMACCT /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */ w_rss = get_memacct_resi(ps->pid); if (w_rss == -1) resisize += ps->rss * pagesize; else resisize += w_rss; if (LOGLEVEL >= 6) { sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu w_rss=%ld sum=%llu", __func__, ps->session, ps->pid, ps->rss * pagesize, w_rss, resisize); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } #else resisize += ps->rss * pagesize; if (LOGLEVEL >= 6) { sprintf(log_buffer, "%s: session=%d pid=%d rss=%llu sum=%llu", __func__, ps->session, ps->pid, ps->rss * pagesize, resisize); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } #endif } /* END for (i) */ return(resisize); } /* END resi_sum() */ /* * Return TRUE if any process in the job is over limit for virtual 
memory usage. */ static int overmem_proc( job *pjob, /* I */ unsigned long long limit) /* I */ { int i; proc_stat_t *ps; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - jobid = %s", pjob->ji_qs.ji_jobid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (!injob(pjob, ps->session)) continue; if (ps->vsize > limit) { return(TRUE); } } /* END for (i) */ return(FALSE); } /* END overmem_proc() */ extern char *msg_momsetlim; /* * Internal error routine */ int error( const char *string, int value) { char *message; assert(string != NULL); assert(*string != '\0'); message = pbse_to_txt(value); assert(message != NULL); assert(*message != '\0'); fprintf(stderr, msg_momsetlim, string, message); fflush(stderr); return(value); } /* END error() */ /* * Establish system-enforced limits for the job. * * Run through the resource list, checking the values for all items * we recognize. * * If set_mode is SET_LIMIT_SET, then also set hard limits for the * system enforced limits (not-polled). * If anything goes wrong with the process, return a PBS error code * and print a message on standard error. A zero-length resource list * is not an error. * * If set_mode is SET_LIMIT_SET the entry conditions are: * 1. MOM has already forked, and we are called from the child. * 2. The child is still running as root. * 3. Standard error is open to the user's file. * * If set_mode is SET_LIMIT_ALTER, we are being called to modify * existing limits. Cannot alter those set by setrlimit (kernel) * because we are the wrong process. 
*/

/*
 * mom_set_limits()
 *
 * pjob     - job whose JOB_ATR_resource list is walked (asserted non-NULL)
 * set_mode - SET_LIMIT_SET or SET_LIMIT_ALTER
 *
 * Returns PBSE_NONE on success; on failure returns the value handed to
 * error(), which also prints a message on stderr.
 */

int mom_set_limits(

  job *pjob,     /* I */
  int  set_mode) /* SET_LIMIT_SET or SET_LIMIT_ALTER */

  {
  const char *pname = NULL;
  int retval;
  unsigned long value; /* place in which to build resource value */
  resource *pres;

  struct rlimit reslim;
  unsigned long vmem_limit = 0;  /* smallest vmem/pvmem value seen, 0 = none */
  unsigned long mem_limit = 0;   /* mem/pmem value plus the current stack limit */

  /* NOTE: log_buffer is exported */

  if (LOGLEVEL >= 2)
    {
    sprintf(log_buffer, "%s(%s,%s) entered",
            __func__,
            (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
            (set_mode == SET_LIMIT_SET) ? "set" : "alter");

    log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);

    log_buffer[0] = '\0';
    }

  assert(pjob != NULL);

  assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);

  pres = (resource *)GET_NEXT(pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);

  /*
   * cycle through all the resource specifications,
   * setting limits appropriately.
   */

  memset(&reslim, 0, sizeof(reslim));

  /* set oom_adj score for the starting job */
  /* if immunize mode is set to on, we have to set child score to 0 */

  if ( (set_mode == SET_LIMIT_SET) && ( job_oom_score_adjust != 0 || mom_oom_immunize != 0 ) )
    {
    retval = oom_adj(job_oom_score_adjust);

    if ( LOGLEVEL >= 2 )
      {
      sprintf(log_buffer, "setting oom_adj '%s'",
              (retval != -1) ? "succeeded" : "failed");

      log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
      }
    };

  while (pres != NULL)
    {
    if (pres->rs_defin != NULL)
      pname = pres->rs_defin->rs_name;
    else
      pname = NULL;

    if (LOGLEVEL >= 2)
      {
      sprintf(log_buffer, "setting limit for attribute '%s'",
              (pname != NULL) ? pname : "NULL");

      log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);

      log_buffer[0] = '\0';
      }

    assert(pres->rs_defin != NULL);

    assert(pname != NULL);

    assert(pname[0] != '\0');

    /* NOTE(review): on most failure paths below log_buffer is filled in but
     * not passed to a log routine before returning - presumably the caller
     * logs the exported buffer; confirm */

    if (!strcmp(pname, "cput"))
      {
      if (igncput == FALSE)
        {
        /* cpu time - check, if less than pcput use it */

        retval = mm_gettime(pres, &value);

        if (retval != PBSE_NONE)
          {
          sprintf(log_buffer, "cput mm_gettime failed in %s",
                  __func__);

          return(error(pname, retval));
          }
        }
      }
    else if (!strcmp(pname, "pcput"))
      {
      if (igncput == FALSE)
        {
        if (set_mode == SET_LIMIT_SET)
          {
          /* process cpu time - set */

          retval = mm_gettime(pres, &value);

          if (retval != PBSE_NONE)
            {
            sprintf(log_buffer, "pcput mm_gettime failed in %s",
                    __func__);

            return(error(pname, retval));
            }

          /* the requested limit is divided by cputfactor before being
           * handed to the kernel */

          reslim.rlim_cur = reslim.rlim_max =
                              (unsigned long)((double)value / cputfactor);

          if (LOGLEVEL >= 2)
            {
            sprintf(log_buffer, "setting cpu time limit to %ld for job %s",
                    (long int)reslim.rlim_cur,
                    pjob->ji_qs.ji_jobid);

            log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);

            log_buffer[0] = '\0';
            }

          /* NOTE: some versions of linux have a bug which causes the parent
                   process to receive a SIGKILL if the child's cpu limit is exceeded */

          if (setrlimit(RLIMIT_CPU, &reslim) < 0)
            {
            sprintf(log_buffer, "setrlimit for RLIMIT_CPU failed in %s, errno=%d (%s)",
                    __func__,
                    errno, strerror(errno));

            return(error("RLIMIT_CPU", PBSE_SYSTEM));
            }
          }    /* END if (set_mode == SET_LIMIT_SET) */
        }
      }
    else if (!strcmp(pname, "file"))
      {
      /* set */

      if (set_mode == SET_LIMIT_SET)
        {
        retval = mm_getsize(pres, &value);

        if (retval != PBSE_NONE)
          {
          sprintf(log_buffer, "mm_getsize() failed for file in %s",
                  __func__);

          return(error(pname, retval));
          }

        /* NOTE(review): value is an unsigned long, so this comparison can
         * never be true; the message would also print reslim.rlim_cur
         * before it has been assigned from value */

        if (value > ULONG_MAX)
          {
          if (LOGLEVEL >= 0)
            {
            sprintf(log_buffer, "cannot set file limit to %ld for job %s (value too large)",
                    (long int)reslim.rlim_cur,
                    pjob->ji_qs.ji_jobid);

            log_err(-1, __func__, log_buffer);

            log_buffer[0] = '\0';
            }

          return(error(pname, PBSE_BADATVAL));
          }

        reslim.rlim_cur = reslim.rlim_max = value;

        if (setrlimit(RLIMIT_FSIZE, &reslim) < 0)
          {
          sprintf(log_buffer, "cannot set file limit to %ld for job %s (setrlimit failed - check default user limits)",
                  (long int)reslim.rlim_max,
                  pjob->ji_qs.ji_jobid);

          log_err(errno, __func__, log_buffer);

          log_buffer[0] = '\0';

          return(error(pname, PBSE_SYSTEM));
          }
        }
      }
    else if (!strcmp(pname, "vmem"))
      {
      if (ignvmem == FALSE)
        {
        /* check */

        retval = mm_getsize(pres, &value);

        if (retval != PBSE_NONE)
          {
          sprintf(log_buffer, "mm_getsize() failed for vmem in %s",
                  __func__);

          return(error(pname, retval));
          }

        /* remember the smallest of the vmem/pvmem values; applied after the loop */

        if ((vmem_limit == 0) || (value < vmem_limit))
          vmem_limit = value;
        }
      }
    else if (!strcmp(pname, "pvmem"))
      {
      if (ignvmem == FALSE)
        {
        /* set */

        if (set_mode == SET_LIMIT_SET)
          {
          retval = mm_getsize(pres, &value);

          if (retval != PBSE_NONE)
            {
            sprintf(log_buffer, "mm_getsize() failed for pvmem in %s",
                    __func__);

            return(error(pname, retval));
            }

          /* NOTE(review): as above, an unsigned long cannot exceed ULONG_MAX */

          if (value > ULONG_MAX)
            {
            log_buffer[0] = '\0';

            sprintf(log_buffer, "invalid value returned by mm_getsize() for pvmem in %s",
                    __func__);

            return(error(pname, PBSE_BADATVAL));
            }

          if ((vmem_limit == 0) || (value < vmem_limit))
            vmem_limit = value;
          }
        }
      }
    else if ((!strcmp(pname,"mem") && (pjob->ji_numnodes != 1)) ||
             !strcmp(pname,"mppmem"))
      {
      /* ignore. If we ever get rid of support for the UNICOS OS then we can
         remove the ATR_DFLAG_MOM | ATR_DFLAG_ALTRUN flags from mppmem */
      }
    else if ((!strcmp(pname, "mem") && (pjob->ji_numnodes == 1)) ||
             !strcmp(pname, "pmem"))
      {
      /* "mem" is only enforced through rlimits when the job has one node */

      if (ignmem == FALSE)
        {
        /* set */

        if (set_mode == SET_LIMIT_SET)
          {
          retval = mm_getsize(pres, &value);

          if (retval != PBSE_NONE)
            {
            sprintf(log_buffer, "mm_getsize() failed for mem/pmem in %s",
                    __func__);

            return(error(pname, retval));
            }

          reslim.rlim_cur = reslim.rlim_max = value;

          if (setrlimit(RLIMIT_DATA, &reslim) < 0)
            {
            sprintf(log_buffer, "cannot set data limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
                    (long int)reslim.rlim_max,
                    pjob->ji_qs.ji_jobid,
                    errno,
                    strerror(errno));

            return(error("RLIMIT_DATA", PBSE_SYSTEM));
            }

          if (setrlimit(RLIMIT_RSS, &reslim) < 0)
            {
            sprintf(log_buffer, "cannot set RSS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
                    (long int)reslim.rlim_max,
                    pjob->ji_qs.ji_jobid,
                    errno,
                    strerror(errno));

            return(error("RLIMIT_RSS", PBSE_SYSTEM));
            }

#ifdef __GATECH
          /* NOTE: best patch may be to change to 'vmem_limit = value;' */

          if (setrlimit(RLIMIT_STACK, &reslim) < 0)
            {
            sprintf(log_buffer, "cannot set stack limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
                    (long int)reslim.rlim_max,
                    pjob->ji_qs.ji_jobid,
                    errno,
                    strerror(errno));

            return(error("RLIMIT_STACK", PBSE_SYSTEM));
            }

          /* set address space */

          if (setrlimit(RLIMIT_AS, &reslim) < 0)
            {
            sprintf(log_buffer, "cannot set AS limit to %ld for job %s (setrlimit failed w/errno=%d (%s) - check default user limits)",
                    (long int)reslim.rlim_max,
                    pjob->ji_qs.ji_jobid,
                    errno,
                    strerror(errno));

            return(error("RLIMIT_AS", PBSE_SYSTEM));
            }

#endif /* __GATECH */

          mem_limit = value;

          /* reslim is overwritten here with the current stack limit */

          if (getrlimit(RLIMIT_STACK, &reslim) >= 0)
            {
            /* NOTE: mem_limit no longer used with UMU patch in place */

            mem_limit = value + reslim.rlim_cur;
            }
          }
        }
      }  /* END else if (!strcmp(pname,"mem") && ... */
    else if (!strcmp(pname, "walltime"))
      {
      /* check */

      retval = mm_gettime(pres, &value);

      if (retval != PBSE_NONE)
        {
        sprintf(log_buffer, "mm_gettime() failed for walltime in %s\n",
                __func__);

        return(error(pname, retval));
        }
      }
    else if (!strcmp(pname, "nice"))
      {
      /* set nice */

      if (set_mode == SET_LIMIT_SET)
        {
        /* nice() may legitimately return -1, so errno is cleared first and
         * checked afterwards to tell failure from a valid result */

        errno = 0;

        if ((nice((int)pres->rs_value.at_val.at_long) == -1) && (errno != 0))
          {
          sprintf(log_buffer, "nice() failed w/errno=%d (%s) in %s\n",
                  errno,
                  strerror(errno),
                  __func__);

          return(error(pname, PBSE_BADATVAL));
          }
        }
      }
    else if (!strcmp(pname, "size"))
      {
      /* ignore */

      /* NO-OP */
      }
    else if (!strcmp(pname, "prologue"))
      {
      }
    else if (!strcmp(pname, "epilogue"))
      {
      }
    else if ((!strcmp(pname, "mppdepth")) ||
             (!strcmp(pname, "mppnodect")) ||
             (!strcmp(pname, "mppwidth")) ||
             (!strcmp(pname, "mppnppn")) ||
             (!strcmp(pname, "mppnodes")) ||
             (!strcmp(pname, "mpplabels")) ||
             (!strcmp(pname, "mpparch")) ||
             (!strcmp(pname, "mpplabel")))
      {
      /* NO-OP */
      }
    else if ((pres->rs_defin->rs_flags & ATR_DFLAG_RMOMIG) == 0)
      {
      /* don't recognize and not marked as ignore by mom */

      sprintf(log_buffer, "do not know how to process resource '%s' in %s\n",
              pname,
              __func__);

      return(error(pname, PBSE_UNKRESC));
      }

    pres = (resource *)GET_NEXT(pres->rs_link);
    }

  if (set_mode == SET_LIMIT_SET)
    {
    /* if either of vmem or pvmem was given, set sys limit to lesser */

    if (vmem_limit != 0)
      {
      /* Don't make (p)vmem < pmem */

      if (mem_limit > vmem_limit)
        {
        vmem_limit = mem_limit;
        }

      reslim.rlim_cur = reslim.rlim_max = vmem_limit;

      if ((ignvmem == 0) && (setrlimit(RLIMIT_AS, &reslim) < 0))
        {
        sprintf(log_buffer, "setrlimit() failed setting AS for vmem_limit mod in %s\n",
                __func__);

        return(error("RLIMIT_AS", PBSE_SYSTEM));
        }

      /* UMU vmem patch sets RLIMIT_AS rather than RLIMIT_DATA and RLIMIT_STACK */

      /*
      reslim.rlim_cur = reslim.rlim_max = mem_limit;

      if (setrlimit(RLIMIT_DATA,&reslim) < 0)
        {
        sprintf(log_buffer,"setrlimit() failed setting data for vmem_limit mod in %s\n",
          id);

        return(error("RLIMIT_DATA",PBSE_SYSTEM));
        }

      if (setrlimit(RLIMIT_STACK,&reslim) < 0)
        {
        sprintf(log_buffer,"setrlimit() failed setting stack for vmem_limit mod in %s\n",
          id);

        return(error("RLIMIT_STACK",PBSE_SYSTEM));
        }
      */
      }
    }

  if (LOGLEVEL >= 5)
    {
    sprintf(log_buffer, "%s(%s,%s) completed",
            __func__,
            (pjob != NULL) ? pjob->ji_qs.ji_jobid : "NULL",
            (set_mode == SET_LIMIT_SET) ? "set" : "alter");

    log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);

    log_buffer[0] = '\0';
    }

  /* SUCCESS */

  return(PBSE_NONE);
  }  /* END mom_set_limits() */

/*
 * State whether MOM main loop has to poll this job to determine if some
 * limits are being exceeded.
 *
 * Returns TRUE if polling is necessary, FALSE otherwise. Actual
 * polling is done using the mom_over_limit machine-dependent function.
 *
 * Polling is needed as soon as the job's resource list names one of
 * walltime, cput, pcput, mem, pvmem or vmem.
 */

int mom_do_poll(

  job *pjob)  /* I */

  {
  const char *pname;
  resource *pres;

  assert(pjob != NULL);

  if (LOGLEVEL >= 4)
    {
    log_record(
      PBSEVENT_JOB,
      PBS_EVENTCLASS_JOB,
      pjob->ji_qs.ji_jobid,
      "evaluating limits for job");
    }

  assert(pjob != NULL);

  assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC);

  pres = (resource *)GET_NEXT(
           pjob->ji_wattr[JOB_ATR_resource].at_val.at_list);

  while (pres != NULL)
    {
    assert(pres->rs_defin != NULL);

    pname = pres->rs_defin->rs_name;

    assert(pname != NULL);
    assert(*pname != '\0');

    if (strcmp(pname, "walltime") == 0 ||
        strcmp(pname, "cput") == 0 ||
        strcmp(pname, "pcput") == 0 ||
        strcmp(pname, "mem") == 0 ||
        strcmp(pname, "pvmem") == 0 ||
        strcmp(pname, "vmem") == 0)
      {
      return(TRUE);
      }

    pres = (resource *)GET_NEXT(pres->rs_link);
    }

  return(FALSE);
  }  /* END mom_do_poll() */

/*
 * Setup for polling.
 *
 * Open kernel device and get namelist info.
*/

int mom_open_poll(void)

  {
  proc_stat_t *table;

  if (LOGLEVEL >= 6)
    {
    log_record(PBSEVENT_SYSTEM, 0, __func__, "started");
    }

  pagesize = getpagesize();

  /* zero-filled table with room for TBL_INC process entries */
  table = (proc_stat_t *)calloc(TBL_INC, sizeof(proc_stat_t));

  /* the global is overwritten whether or not the allocation succeeded */
  proc_array = table;

  if (table == NULL)
    {
    log_err(errno, __func__, "calloc");

    return(PBSE_SYSTEM);
    }

  max_proc = TBL_INC;

  return(PBSE_NONE);
  }  /* END mom_open_poll() */




/*
 * Declare start of polling loop.
 *
 * This function caches information about all of the processes
 * on the compute node (pbs_mom calls this function).  Each process
 * in /proc/ is queried by looking at the 'stat' file.  Statistics like
 * CPU usage time, memory consumption, etc. are gathered in the proc_array
 * list.  This list is then used throughout the pbs_mom to get information
 * about tasks it is monitoring.
 *
 * This function is called from the main MOM loop once every "check_poll_interval"
 * seconds.
 *
 * @see get_proc_stat() - child
 * @see mom_set_use() - Aggregates data collected here
 *
 * NOTE:  populates global 'proc_array[]' variable.
 * NOTE:  reallocs proc_array[] as needed to accommodate processes.
 *
 * @see mom_open_poll() - allocs proc_array table.
 * @see mom_close_poll() - frees proc_array.
* @see setup_program_environment() - parent - called at pbs_mom start * @see main_loop() - parent - called once per iteration * @see mom_set_use() - populate job structure with usage data for local use or to send to mother superior */ int mom_get_sample(void) { proc_stat_t *pi; proc_stat_t *ps; pid_t pid; #ifdef PENABLE_LINUX26_CPUSETS struct pidl *pids = NULL; struct pidl *pp; #else struct dirent *dent; #endif if (proc_array == NULL) mom_open_poll(); nproc = 0; pi = proc_array; if (LOGLEVEL >= 6) { log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "proc_array load started"); } #ifdef PENABLE_LINUX26_CPUSETS /* Instead of collect stats of all processes running on a large SMP system, * collect stats of processes running in and below the Torque cpuset, only * This relies on reliable process starters for MPI, which bind their tasks * to the cpuset of the job. */ #ifdef USELIBCPUSET pids = get_cpuset_pidlist(TTORQUECPUSET_BASE, pids); #else pids = get_cpuset_pidlist(TTORQUECPUSET_PATH, pids); #endif pp = pids; while (pp != NULL) { pid = pp->pid; pp = pp->next; #else if (pdir == NULL) { if ((pdir = opendir(procfs)) == NULL) return(PBSE_SYSTEM); } rewinddir(pdir); while ((dent = readdir(pdir)) != NULL) { if (!isdigit(dent->d_name[0])) continue; pid = atoi(dent->d_name); #endif if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat", pid); log_err(errno, __func__, log_buffer); } continue; } /* nproc++; -- we need to increment AFTER assigning this ps to the proc_array--otherwise we could skip it in for loops */ if ((nproc + 1) >= max_proc) { proc_stat_t *hold; if (LOGLEVEL >= 9) { log_record(PBSEVENT_DEBUG, PBS_EVENTCLASS_SERVER, __func__, "alloc more proc_array"); } max_proc *= 2; hold = (proc_stat_t *)calloc(1, max_proc * sizeof(proc_stat_t)); if (hold == NULL) { log_err(errno, __func__, "unable to realloc space for proc_array sample"); return(PBSE_SYSTEM); } memcpy(hold, proc_array, sizeof(proc_stat_t) * max_proc 
/ 2); free(proc_array); proc_array = hold; } /* END if ((nproc+1) == max_proc) */ pi = &proc_array[nproc++]; memcpy(pi, ps, sizeof(proc_stat_t)); } /* END while (...) != NULL) */ #ifdef PENABLE_LINUX26_CPUSETS free_pidlist(pids); #endif if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loaded - nproc=%d", nproc); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } return(PBSE_NONE); } /* END mom_get_sample() */ /* * Measure job resource usage and compare with its limits. * * If it has exceeded any well-formed polled limit return the limit that * it exceeded. * Otherwise, return PBSE_NONE. log_buffer is populated with failure. */ int mom_over_limit( job *pjob) /* I */ { const char *pname; int retval; unsigned long value; unsigned long num; unsigned long long numll; resource *pres; assert(pjob != NULL); assert(pjob->ji_wattr[JOB_ATR_resource].at_type == ATR_TYPE_RESC); pres = (resource *)GET_NEXT( pjob->ji_wattr[JOB_ATR_resource].at_val.at_list); for (;pres != NULL;pres = (resource *)GET_NEXT(pres->rs_link)) { assert(pres->rs_defin != NULL); pname = pres->rs_defin->rs_name; assert(pname != NULL); assert(*pname != '\0'); if ((igncput == FALSE) && (strcmp(pname, "cput") == 0)) { retval = mm_gettime(pres, &value); if (retval != PBSE_NONE) continue; if ((num = cput_sum(pjob)) > value) { sprintf(log_buffer, "cput %lu exceeded limit %lu", num, value); return(JOB_EXEC_OVERLIMIT_CPUT); } } else if ((igncput == FALSE) && (strcmp(pname, "pcput") == 0)) { retval = mm_gettime(pres, &value); if (retval != PBSE_NONE) continue; if (overcpu_proc(pjob, value)) { sprintf(log_buffer, "pcput exceeded limit %lu", value); return(JOB_EXEC_OVERLIMIT_CPUT); } } else if (strcmp(pname, "vmem") == 0) { retval = mm_getsize(pres, &value); if (retval != PBSE_NONE) continue; if ((ignvmem == 0) && ((numll = mem_sum(pjob)) > value)) { sprintf(log_buffer, "vmem %llu exceeded limit %lu", numll, value); return(JOB_EXEC_OVERLIMIT_MEM); } } else if (strcmp(pname, "pvmem") == 0) { unsigned long long 
valuell; retval = mm_getsize(pres, &value); if (retval != PBSE_NONE) continue; valuell = (unsigned long long)value; if ((ignvmem == 0) && (overmem_proc(pjob, valuell))) { sprintf(log_buffer, "pvmem exceeded limit %llu", valuell); return(JOB_EXEC_OVERLIMIT_MEM); } } else if (ignwalltime == 0 && strcmp(pname, "walltime") == 0) { /* no need to check walltime on sisters, MS will get it */ if (am_i_mother_superior(*pjob) == false) continue; retval = mm_gettime(pres, &value); if (retval != PBSE_NONE) continue; num = (unsigned long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); if (num > value) { sprintf(log_buffer, "walltime %ld exceeded limit %ld", num, value); return(JOB_EXEC_OVERLIMIT_WT); } } } /* END for (pres) */ #ifdef PENABLE_LINUX26_CPUSETS /* Check memory_pressure */ if (memory_pressure_threshold > 0) { /* * If last recorded memory_pressure is over threshold, increment counter. * If duration is enabled, throw over_limit if counter reaches duration. */ if (pjob->ji_mempressure_curr < memory_pressure_threshold) { pjob->ji_mempressure_cnt = 0; /* reset */ } else { pjob->ji_mempressure_cnt++; /* count */ sprintf(log_buffer, "job %s memory_pressure is over %d for %d (%d) cycles", pjob->ji_qs.ji_jobid, memory_pressure_threshold, pjob->ji_mempressure_cnt, memory_pressure_duration); log_ext(-1, __func__, log_buffer,LOG_ALERT); if (memory_pressure_duration && (pjob->ji_mempressure_cnt >= memory_pressure_duration)) { sprintf(log_buffer, "swap rate due to memory oversubscription is too high"); return(JOB_EXEC_OVERLIMIT_MEM); } } } #endif return(PBSE_NONE); } /* END mom_over_limit() */ /* * Update the job attribute for resources used. * * The first time this function is called for a job, * it sets up resource entries for * each resource that can be reported for this machine. * * Subsequent calls update the resource usage information based on * stats gathered by the mom_get_sample() function. 
This function * is often called by "im_request()" as a result of POLL_JOB query * from the mother superior. * * @see im_request() - parent - respond to poll_job request from mother superior * @see examine_all_running_jobs() - parent - update local use on mother superior * @see TMomFinalizeJob1() - parent - update serial job immediately at job start * * @return An error code if something goes wrong. */ int mom_set_use( job *pjob) /* I (modified) */ { resource *pres; pbs_attribute *at; resource_def *rd; unsigned long *lp; unsigned long lnum; #ifdef PENABLE_LINUX26_CPUSETS int inum; #endif assert(pjob != NULL); at = &pjob->ji_wattr[JOB_ATR_resc_used]; assert(at->at_type == ATR_TYPE_RESC); #ifdef USESAVEDRESOURCES /* don't update jobs that are marked as recovery */ if (pjob->ji_flags & MOM_JOB_RECOVERY) { return(PBSE_NONE); } #endif /* USESAVEDRESOURCES */ at->at_flags |= ATR_VFLAG_MODIFY; if ((at->at_flags & ATR_VFLAG_SET) == 0) { /* initialize usage structures */ at->at_flags |= ATR_VFLAG_SET; rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = ATR_SV_BYTESZ; rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_LONG; rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = add_resource_entry(at, rd); pres->rs_value.at_flags |= ATR_VFLAG_SET; pres->rs_value.at_type = ATR_TYPE_SIZE; pres->rs_value.at_val.at_size.atsv_shift = 10; /* KB */ pres->rs_value.at_val.at_size.atsv_units = 
ATR_SV_BYTESZ; } /* END if ((at->at_flags & ATR_VFLAG_SET) == 0) */ /* get cputime */ rd = find_resc_def(svr_resc_def, "cput", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = (unsigned long *) & pres->rs_value.at_val.at_long; lnum = cput_sum(pjob); *lp = MAX(*lp, lnum); /* get swap */ rd = find_resc_def(svr_resc_def, "vmem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; lnum = (mem_sum(pjob) + 1023) >> pres->rs_value.at_val.at_size.atsv_shift; /* as KB */ *lp = MAX(*lp, lnum); /* get walltime */ rd = find_resc_def(svr_resc_def, "walltime", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); /* NOTE: starting jobs can come through here before stime is recorded */ if (pjob->ji_qs.ji_stime == 0) pres->rs_value.at_val.at_long = 0; else pres->rs_value.at_val.at_long = (long)((double)(time_now - pjob->ji_qs.ji_stime) * wallfactor); /* get memory */ rd = find_resc_def(svr_resc_def, "mem", svr_resc_size); assert(rd != NULL); pres = find_resc_entry(at, rd); assert(pres != NULL); lp = &pres->rs_value.at_val.at_size.atsv_num; lnum = (resi_sum(pjob) + 1023) >> pres->rs_value.at_val.at_size.atsv_shift; /* as KB */ *lp = MAX(*lp, lnum); #ifdef PENABLE_LINUX26_CPUSETS /* get memory_pressure */ if (memory_pressure_threshold > 0) { inum = get_cpuset_mempressure(pjob->ji_qs.ji_jobid); /* Store if success */ if (inum != -1) pjob->ji_mempressure_curr = inum; /* Alert if there is pressure */ if (inum > 0) { sprintf(log_buffer, "job %s causes memory_pressure %d", pjob->ji_qs.ji_jobid, inum); log_ext(-1, __func__, log_buffer, LOG_ALERT); } } else { pjob->ji_mempressure_curr = 0; } #endif return(PBSE_NONE); } /* END mom_set_use() */ /** * Kill a task session. * Call with the task pointer and a signal number. 
* * @return number of tasks signalled (0 = failure) * * @see kill_job() - parent * * NOTE: should support killpg() or killpidtree() - (NYI) * may be required for suspend/resume */ int kill_task( task *ptask, /* I */ int sig, /* I */ int pg) /* I (1=signal process group, 0=signal master process only) */ { int ct = 0; /* num of processes killed */ int ctThisIteration = 0; int ctCleanIterations = 0; int loopCt = 0; int NumProcessesFound = 0; /* number of processes found with session ID */ #ifdef PENABLE_LINUX26_CPUSETS struct pidl *pids = NULL; struct pidl *pp; #else struct dirent *dent; #endif pid_t pid; proc_stat_t *ps; int sesid; pid_t mompid; sesid = ptask->ti_qs.ti_sid; mompid = getpid(); if (LOGLEVEL >= 5) { sprintf(log_buffer, "%s: sending signal %d to task %d, session %d", __func__, sig, ptask->ti_qs.ti_task, sesid); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } if (sesid <= 1) { if (LOGLEVEL >= 3) { sprintf(log_buffer, "cannot send signal %d to task (no session id)", sig); log_record( PBSEVENT_ERROR, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } /* FAILURE */ return(0); } do { ctThisIteration = 0; /* NOTE: do not use cached proc-buffer since we need up-to-date info */ #ifdef PENABLE_LINUX26_CPUSETS /* Instead of collecting stats of all processes running on a large SMP system, * collect stats of processes running in and below the Torque cpuset, only * This relies on reliable process starters for MPI, which bind their tasks * to the cpuset of the job. 
*/ #ifdef USELIBCPUSET pids = get_cpuset_pidlist(TTORQUECPUSET_BASE, pids); #else pids = get_cpuset_pidlist(TTORQUECPUSET_PATH, pids); #endif /* USELIBCPUSET */ pp = pids; while (pp != NULL) { pid = pp->pid; pp = pp->next; #else if (pdir == NULL) { if ((pdir = opendir(procfs)) == NULL) return(PBSE_SYSTEM); } /* pdir is global */ rewinddir(pdir); while ((dent = readdir(pdir)) != NULL) { if (!isdigit(dent->d_name[0])) continue; pid = atoi(dent->d_name); #endif /* PENABLE_LINUX26_CPUSETS */ if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat", pid); log_err(errno, __func__, log_buffer); } continue; } if ((sesid == ps->session) || (ProcIsChild(procfs,pid,ptask->ti_job->ji_qs.ji_jobid) == TRUE)) { NumProcessesFound++; if ((ps->state == 'Z') || (ps->pid == 0)) { /* * Killing a zombie is sure death! Its pid is zero, * which to kill(2) means 'every process in the process * group of the current process'. */ sprintf(log_buffer, "%s: not killing process (pid=%d/state=%c) with sig %d", __func__, ps->pid, ps->state, sig); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } /* END if ((ps->state == 'Z') || (ps->pid == 0)) */ else { int i = 0; if (ps->pid == mompid) { /* * there is a race condition with newly started jobs that * can be killed before they've established their own * session id. This means the child tasks still have MOM's * session id. We check this to make sure MOM doesn't kill * herself. */ if (LOGLEVEL >= 3) { sprintf(log_buffer, "%s: not killing process %d. Avoid sending signal because child task still has MOM's session id", __func__, ps->pid); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } if((sig == SIGKILL)||(sig == SIGTERM)) { ++ctThisIteration; //Ultimately this is task that will need to be killed. 
} continue; } /* END if (ps->pid == mompid) */ if((sig == SIGKILL)||(sig == SIGTERM)) { ++ctThisIteration; //Only count for killing don't count for any other signal. } if (sig == SIGKILL) { struct timespec req; req.tv_sec = 0; req.tv_nsec = 250000000; /* .25 seconds */ /* give the process some time to quit gracefully first (up to .25*20=5 seconds) */ sprintf(log_buffer, "%s: killing pid %d task %d gracefully with sig %d", __func__, ps->pid, ptask->ti_qs.ti_task, SIGTERM); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); if (pg == 0) kill(ps->pid, SIGTERM); else killpg(ps->pid, SIGTERM); for (i = 0;i < 20;i++) { /* check if process is gone */ if ((ps = get_proc_stat(ps->pid)) == NULL) { break; } else { sprintf(log_buffer, "%s: process (pid=%d/state=%c) after sig %d", __func__, ps->pid, ps->state, SIGTERM); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); if (ps->state == 'Z') break; } /* try to kill again */ if (kill(ps->pid, 0) == -1) break; nanosleep(&req, NULL); } /* END for (i = 0) */ } /* END if (sig == SIGKILL) */ else { i = 20; } if (i >= 20) { /* NOTE: handle race-condition where process goes zombie as a result of previous SIGTERM */ /* update proc info from /proc//stat */ if ((ps = get_proc_stat(ps->pid)) != NULL) { if (ps->state == 'Z') { /* * Killing a zombie is sure death! Its pid is zero, * which to kill(2) means 'every process in the process * group of the current process'. */ sprintf(log_buffer, "%s: not killing process (pid=%d/state=%c) with sig %d", __func__, ps->pid, ps->state, sig); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } /* END if ((ps->state == 'Z') || (ps->pid == 0)) */ else { /* kill process hard */ /* why is this not killing with SIGKILL? 
*/ sprintf(log_buffer, "%s: killing pid %d task %d with sig %d", __func__, ps->pid, ptask->ti_qs.ti_task, sig); log_record( PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); if (pg == 0) kill(ps->pid, sig); else killpg(ps->pid, sig); } } /* END if ((ps = get_proc_stat(ps->pid)) != NULL) */ } /* END if (i >= 20) */ ++ct; } /* END else ((ps->state == 'Z') || (ps->pid == 0)) */ } /* END if (sesid == ps->session) */ } /* END while (...) != NULL) */ #ifdef PENABLE_LINUX26_CPUSETS free_pidlist(pids); pids = NULL; #endif if(ctThisIteration == 0) { ctCleanIterations++; } else { ctCleanIterations=0; } }while((ctCleanIterations <= 5)&&(loopCt++ < 20)); /* NOTE: to fix bad state situations resulting from a hard crash, the logic below should be triggered any time no processes are found (NYI) */ if (IS_ADOPTED_TASK(ptask->ti_qs.ti_task) && (NumProcessesFound == 0)) { /* no process was found, but for an adopted task this is OK (we don't find * out about the adopted task's termination via waitpid()--so we can safely * say that we have "killed" the task, even though the task was killed/died * some other way */ ct++; /* do code to mark task as finished (borrowed from Linux scan_for_terminated())... */ ptask->ti_qs.ti_exitstat = 0; /* assume successful completion */ ptask->ti_qs.ti_status = TI_STATE_EXITED; task_save(ptask); sprintf(log_buffer, "%s: job %s adopted task %d was marked as terminated because task's PID was no longer found, sid=%d", __func__, ptask->ti_job->ji_qs.ji_jobid, ptask->ti_qs.ti_task, ptask->ti_qs.ti_sid); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } if ((NumProcessesFound == 0) && (ct <= 0)) { /* we can't find any processes belonging to given session, so we can safely say * that we "killed" the task and have TORQUE clean it up */ ct++; /* do code to mark task as finished (borrowed from Linux scan_for_terminated())... 
*/ ptask->ti_qs.ti_exitstat = 0; /* assume successful completion */ ptask->ti_qs.ti_status = TI_STATE_EXITED; task_save(ptask); if (LOGLEVEL >= 5) { sprintf(log_buffer, "%s: could not send signal %d to task %d (session %d)--no process was found with this session ID (marking task as killed)!", __func__, sig, ptask->ti_qs.ti_task, sesid); log_record(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, ptask->ti_job->ji_qs.ji_jobid, log_buffer); } } /* SUCCESS */ return(ct); } /* END kill_task() */ /* * Clean up everything related to polling. */ int mom_close_poll(void) { if (LOGLEVEL >= 6) { log_record(PBSEVENT_SYSTEM, 0, __func__, "entered"); } if (pdir != NULL) { if (closedir(pdir) != 0) { log_err(errno, __func__, "closedir"); return(PBSE_SYSTEM); } pdir = NULL; } if (proc_array != NULL) { free(proc_array); proc_array = NULL; nproc = 0; max_proc = TBL_INC; } return(PBSE_NONE); } /* END mom_close_poll() */ /* * mom_does_checkpoint * * @returns CST values as described in resmon.h. */ int mom_does_checkpoint(void) { return(CST_BLCR); /* Use the BLCR checkpointing system. */ } /* * Checkpoint the job. * * If abort is true, kill it too. */ int mach_checkpoint( task *ptask, /* I */ char *file, /* I */ int abort) /* I */ { return(-1); } /* END mach_checkpoint() */ /* * Restart the job from the checkpoint file. * * Return -1 on error or sid if okay. 
*/ long mach_restart( task *ptask, char *file) { return(-1); } #define dsecs(val) ( (double)(val) ) char *cput_job( pid_t jobid) { int found = 0; int i; double cputime, addtime; proc_stat_t *ps; cputime = 0.0; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - jobid = %d", jobid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (jobid != ps->session) continue; found = 1; /* add utime and stime (AKE) */ addtime = dsecs(ps->utime) + dsecs(ps->stime) + dsecs(ps->cutime) + dsecs(ps->cstime); cputime += addtime; DBPRT(("%s: total %.2f pid %d %.2f\n", __func__, cputime, ps->pid, addtime)) } /* END for (i) */ if (!found) { rm_errno = RM_ERR_EXIST; return(NULL); } sprintf(ret_string, "%.2f", cputime * cputfactor); return(ret_string); } /* END cput_job() */ char *cput_proc( pid_t pid) { double cputime; proc_stat_t *ps; cputime = 0.0; if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat", pid); log_err(errno, __func__, log_buffer); } rm_errno = RM_ERR_SYSTEM; return(NULL); } cputime = dsecs(ps->utime) + dsecs(ps->stime); sprintf(ret_string, "%.2f", cputime * cputfactor); return(ret_string); } /* END cput_proc() */ const char *cput( struct rm_attribute *attrib) { int value; if (attrib == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if ((value = atoi(attrib->a_value)) == 0) { sprintf(log_buffer, "bad param: %s", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (momgetattr(NULL)) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "session") == 0) { return(cput_job((pid_t)value)); } if (strcmp(attrib->a_qualifier, "proc") == 0) { return(cput_proc((pid_t)value)); } rm_errno = RM_ERR_BADPARAM; return(NULL); } /* END cput() */ char *mem_job( pid_t sid) /* I */ { unsigned long long memsize; int i; proc_stat_t 
*ps; /* max memsize ??? */ memsize = 0; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - sid = %d", sid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (sid != ps->session) continue; memsize += ps->vsize; } /* END for (i) */ if (memsize == 0) { rm_errno = RM_ERR_EXIST; return(NULL); } sprintf(ret_string, "%llukb", memsize >> 10); /* KB */ return(ret_string); } /* END mem_job() */ char *mem_proc( pid_t pid) { proc_stat_t *ps; if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat", pid); log_err(errno, __func__, log_buffer); } rm_errno = RM_ERR_SYSTEM; return(NULL); } sprintf(ret_string, "%llukb", (unsigned long long)ps->vsize >> 10); /* KB */ return(ret_string); } /* END mem_proc() */ const char *mem( struct rm_attribute *attrib) { int value; if (attrib == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if ((value = atoi(attrib->a_value)) == 0) { sprintf(log_buffer, "bad param: %s", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (momgetattr(NULL)) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "session") == 0) { return(mem_job((pid_t)value)); } else if (strcmp(attrib->a_qualifier, "proc") == 0) { return(mem_proc((pid_t)value)); } else { rm_errno = RM_ERR_BADPARAM; return(NULL); } return(NULL); } /* END mem() */ static char *resi_job( pid_t jobid) { int i; int found = 0; unsigned long long resisize; proc_stat_t *ps; #ifdef USELIBMEMACCT long long w_rss; #endif resisize = 0; if (LOGLEVEL >= 6) { sprintf(log_buffer, "proc_array loop start - jobid = %d", jobid); log_record(PBSEVENT_DEBUG, 0, __func__, log_buffer); } for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (jobid != ps->session) continue; found = 1; #ifdef USELIBMEMACCT /* Ask memacctd for weighted rss of pid, use this instead of 
ps->rss */ w_rss = get_memacct_resi(ps->pid); if (w_rss == -1) resisize += ps->rss * pagesize; else resisize += w_rss; #else resisize += ps->rss; #endif } /* END for (i) */ if (found) { /* in KB */ #ifdef USELIBMEMACCT sprintf(ret_string, "%llukb", resisize >> 10); #else sprintf(ret_string, "%llukb", (resisize * (unsigned long long)pagesize) >> 10); #endif return(ret_string); } rm_errno = RM_ERR_EXIST; return(NULL); } /* END resi_job() */ static char *resi_proc( pid_t pid) { proc_stat_t *ps; #ifdef USELIBMEMACCT long long w_rss; #endif if ((ps = get_proc_stat(pid)) == NULL) { if (errno != ENOENT) { sprintf(log_buffer, "%d: get_proc_stat(PIOCPSINFO)", pid); log_err(errno, __func__, log_buffer); } rm_errno = RM_ERR_SYSTEM; return(NULL); } #ifdef USELIBMEMACCT /* Ask memacctd for weighted rss of pid, use this instead of ps->rss */ if ((w_rss = get_memacct_resi(ps->pid)) == -1) sprintf(ret_string, "%llukb", (ps->rss * (unsigned long long)pagesize) >> 10); else sprintf(ret_string, "%ldkb", w_rss >> 10); #else /* in KB */ sprintf(ret_string, "%lukb", ((ulong)ps->rss * (ulong)pagesize) >> 10); #endif return(ret_string); } /* END resi_proc() */ static const char *resi( struct rm_attribute *attrib) { int value; if (attrib == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if ((value = atoi(attrib->a_value)) == 0) { sprintf(log_buffer, "bad param: %s", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (momgetattr(NULL)) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "session") == 0) { return(resi_job((pid_t)value)); } if (strcmp(attrib->a_qualifier, "proc") == 0) { return(resi_proc((pid_t)value)); } rm_errno = RM_ERR_BADPARAM; return(NULL); } /* END resi() */ const char *sessions( struct rm_attribute *attrib) /* I */ { int nsids = 0; pid_t sid; char *s; #ifdef NUMA_SUPPORT char mom_check_name[PBS_MAXSERVERNAME]; job *pjob; 
task *ptask;
#else
  proc_stat_t *ps;
  struct pidl *sids = NULL, *sl = NULL, *sp;
  int i;
#endif

  /* this request takes no parameters */
  if (attrib != NULL)
    {
    log_err(-1, __func__, extra_parm);
    rm_errno = RM_ERR_BADPARAM;
    return(NULL);
    }

  ret_string[0] = '\0';

#ifdef NUMA_SUPPORT
  /* Build "<short hostname>-<numa_index>/", the pattern looked for in each
   * job's exec_host string to decide whether it runs on this NUMA node */
  strcpy(mom_check_name, mom_host);
  if ((s = strchr(mom_check_name, '.')) != NULL)
    *s = '\0';
  sprintf(mom_check_name + strlen(mom_check_name), "-%d/", numa_index);

  /* Initialize the return string */
  s = ret_string;

  /* Walk through job list, look for jobs running on this NUMA node */
  for (pjob = (job *)GET_NEXT(svr_alljobs);
       pjob != NULL;
       pjob = (job *)GET_NEXT(pjob->ji_alljobs))
    {
    if (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, mom_check_name) == NULL)
      continue;

    /* Report the session id of every running task registered for this job */
    for (ptask = (task *)GET_NEXT(pjob->ji_tasks);
         ptask != NULL;
         ptask = (task *)GET_NEXT(ptask->ti_jobtask))
      {
      if (ptask->ti_qs.ti_status != TI_STATE_RUNNING)
        continue;

      sid = ptask->ti_qs.ti_sid;

      if (LOGLEVEL >= 9)
        {
        sprintf(log_buffer, "%s[%d]: job %s on %s? sid %d",
          __func__,
          nsids,
          pjob->ji_qs.ji_jobid,
          mom_check_name,
          sid);
        log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
        }

      /* NOTE(review): checkret() is defined elsewhere; presumably it makes
       * sure 100 more bytes are available at s -- confirm */
      checkret(&s, 100);

      /* entries after the first are separated by a single blank */
      sprintf(s, "%s%d", (ret_string[0] != '\0') ? " " : "", sid);
      s += strlen(s);

      nsids++;
      } /* END for(ptask) */
    } /* END for(pjob) */

#else
  /* Walk through proc_array, store unique session IDs in the sids list */
  for (i = 0;i < nproc;i++)
    {
    ps = &proc_array[i];

    /* processes owned by uid 0 are not reported */
    if (ps->uid == 0)
      continue;

    /* neither are processes whose session id is 0 */
    if ((sid = ps->session) == 0)
      continue;

    if (LOGLEVEL >= 9)
      {
      sprintf(log_buffer, "%s[%d]: pid %d sid %d",
        __func__,
        nsids,
        ps->pid,
        sid);
      log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
      }

    /* linear search: has this session id been recorded already? */
    sp = sids;
    while (sp)
      {
      if (sp->pid == sid) /* found */
        break;
      sp = sp->next;
      }

    if (sp)
      continue; /* already in the list */

    /* not found: append a new node (its pid field holds the session id) */
    if ((sp = (struct pidl *)calloc(1, sizeof(struct pidl))) == NULL)
      {
      log_err(errno, __func__, "no memory");
      rm_errno = RM_ERR_SYSTEM;
      if (sids)
        free_pidlist(sids);
      return(NULL);
      }

    sp->pid = sid;
    sp->next = NULL;
    nsids++;

    /* sl is the current tail; sids is the head */
    if (sl)
      sl->next = sp;
    else
      sids = sp;
    sl = sp;
    } /* END for(i) */

  /*
   * Assemble return string.
   * Return empty string if no sessions.
   */
  s = ret_string;
  sp = sids;
  while (sp)
    {
    checkret(&s, 100);

    /* the first entry has no leading blank */
    if (sp == sids)
      sprintf(s, "%d", sp->pid);
    else
      sprintf(s, " %d", sp->pid);
    s += strlen(s);

    sp = sp->next;
    } /* END while(sp) */

  /* Done */
  if (sids)
    free_pidlist(sids);

#endif

  if (LOGLEVEL >= 6)
    {
    sprintf(log_buffer, "nsessions=%d",
      nsids);
    log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer);
    }

  return(ret_string);
  }




/*
 * Report the number of sessions.
 *
 * Calls sessions() and counts the entries in the blank-separated list it
 * returns, then writes that count into ret_string.  A NULL result from
 * sessions() is passed through unchanged.
 */

const char *nsessions(

  struct rm_attribute *attrib)

  {
  const char *result;
  const char *ch;
  int num;

  if ((result = sessions(attrib)) == NULL)
    return(result);

  if (result[0] == '\0')
    {
    /* empty list */
    num = 0;
    }
  else
    {
    /* N entries are separated by N-1 blanks */
    num = 1;

    for (ch = result;*ch;ch++)
      if (*ch == ' ') /* count blanks */
        num++;
    } /* END else (non-empty list) */

  sprintf(ret_string, "%d", num);

  return(ret_string);
  } /* END nsessions() */




/*
 * Report the pids of all sampled processes that belong to a session.
 *
 * The attribute value must be a non-zero session id and the qualifier must
 * be "session"; no further attribute may follow.
 */

const char *pids(

  struct rm_attribute *attrib) /* I */

  {
  pid_t jobid;
  proc_stat_t *ps;
  char *fmt;
  int i;
  int num_pids = 0;

  if (attrib == NULL)
    {
    log_err(-1, __func__, no_parm);
    rm_errno = RM_ERR_NOPARAM;
    return(NULL);
    }

  if ((jobid = (pid_t)atoi(attrib->a_value)) == 0)
    {
    sprintf(log_buffer, "bad param: %s", attrib->a_value);
    log_err(-1,
__func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (momgetattr(NULL)) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "session") != 0) { rm_errno = RM_ERR_BADPARAM; return(NULL); } /* Search for members of session */ fmt = ret_string; for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (LOGLEVEL >= 6) { DBPRT(("%s[%d]: pid: %d sid: %d\n", __func__, num_pids, ps->pid, ps->session)) } if (jobid != ps->session) continue; sprintf(fmt, "%d ", ps->pid); fmt += strlen(fmt); num_pids++; } /* END for (i) */ if (num_pids == 0) { rm_errno = RM_ERR_EXIST; return(NULL); } return(ret_string); } /* END pids() */ const char *nusers( struct rm_attribute *attrib) { int j; int nuids = 0; uid_t *uids, *hold; static int maxuid = 200; register uid_t uid; #ifdef NUMA_SUPPORT char mom_check_name[PBS_MAXSERVERNAME], *s; job *pjob; #else int i; proc_stat_t *ps; #endif if (attrib != NULL) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((uids = (uid_t *)calloc(maxuid, sizeof(uid_t))) == NULL) { log_err(errno, __func__, "no memory"); rm_errno = RM_ERR_SYSTEM; return(NULL); } #ifdef NUMA_SUPPORT /* Initialize the node name to check for for this NUMA node */ strcpy(mom_check_name, mom_host); if ((s = strchr(mom_check_name, '.')) != NULL) *s = '\0'; sprintf(mom_check_name + strlen(mom_check_name), "-%d/", numa_index); /* Walk through job list, look for jobs running on this NUMA node */ for (pjob = (job *)GET_NEXT(svr_alljobs); pjob != NULL; pjob = (job *)GET_NEXT(pjob->ji_alljobs)) { if (strstr(pjob->ji_wattr[JOB_ATR_exec_host].at_val.at_str, mom_check_name) == NULL) continue; /* Store uid of job owner */ uid = pjob->ji_qs.ji_un.ji_momt.ji_exuid; if (LOGLEVEL >= 9) { sprintf(log_buffer, "%s[%d]: job %s on %s? 
uid %d", __func__, nuids, pjob->ji_qs.ji_jobid, mom_check_name, uid); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } #else for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if ((uid = ps->uid) == 0) continue; if (LOGLEVEL >= 9) { sprintf(log_buffer, "%s[%d]: pid %d uid %d", __func__, nuids, ps->pid, uid); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } #endif for (j = 0;j < nuids;j++) { if (uids[j] == uid) break; } if (j == nuids) { /* not found */ if (nuids == maxuid) { /* need more space */ maxuid += 100; hold = (uid_t *)realloc(uids, maxuid); if (hold == NULL) { log_err(errno, __func__, "realloc"); rm_errno = RM_ERR_SYSTEM; free(uids); return(NULL); } memset(hold+(maxuid-100), 0, 100*sizeof(uid_t)); if (LOGLEVEL >= 7) { sprintf(log_buffer, "%s[%d]: need more space: %d", __func__, nuids, maxuid); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } hold[nuids++] = uid; /* add uid to list */ uids = hold; } else { uids[nuids++] = uid; /* add uid to list */ } } } /* END for (i) */ sprintf(ret_string, "%d", nuids); free(uids); if (LOGLEVEL >= 6) { sprintf(log_buffer, "nusers=%d", nuids); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } return(ret_string); } /* END nusers() */ const char *totmem( struct rm_attribute *attrib) { proc_mem_t *mm; if (attrib) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((mm = get_proc_mem()) == NULL) { log_err(errno, __func__, "get_proc_mem"); rm_errno = RM_ERR_SYSTEM; return(NULL); } if (LOGLEVEL >= 6) { sprintf(log_buffer, "%s: total mem=%llu", __func__, mm->mem_total + mm->swap_total); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } sprintf(ret_string, "%lukb", (ulong)((mm->mem_total >> 10) + (mm->swap_total >> 10))); /* KB */ return(ret_string); } /* END totmem() */ const char *availmem( struct rm_attribute *attrib) { proc_mem_t *mm; if (attrib != NULL) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((mm = get_proc_mem()) == 
NULL) { log_err(errno, __func__, "get_proc_mem"); rm_errno = RM_ERR_SYSTEM; return(NULL); } /* END availmem() */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "%s: free mem=%llu", __func__, mm->mem_free + mm->swap_free); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); } sprintf(ret_string, "%lukb", (ulong)((mm->mem_free >> 10) + (mm->swap_free >> 10))); /* KB */ return(ret_string); } /* END availmem() */ const char *ncpus( struct rm_attribute *attrib) { #ifdef NUMA_SUPPORT /* report the configured ncpus for this numa node */ sprintf(ret_string,"%d",node_boards[numa_index].num_cpus); #else char label[128]; FILE *fp; int procs; if (attrib != NULL) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) { return(NULL); } procs = 0; while (!feof(fp)) { if (fscanf(fp, "%s %*[^\n]%*c", label) == 0) { getc(fp); /* must do something to get to eof */ } else if (strcmp("processor", label) == 0) procs++; } sprintf(ret_string, "%d", procs); system_ncpus = procs; fclose(fp); #endif /* NUMA_SUPPORT */ if (LOGLEVEL >= 6) { sprintf(log_buffer, "ncpus=%s", ret_string); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, "ncpus", log_buffer); } return(ret_string); } /* END ncpus() */ /* find_file checks for the existence of filename * in the ':' delimited path string * Return TRUE if file exists * FALSE if file not found */ int find_file( char *path, char *filename) { char *ptr1, *ptr2; char buf[RETURN_STRING_SIZE]; int rc; struct stat statBuf; if (path == NULL) { return(FALSE); } if (filename == NULL) { return(FALSE); } memset(buf, 0, RETURN_STRING_SIZE); ptr1 = path; ptr2 = buf; do { *ptr2 = *ptr1; ptr1++; if (*ptr1 == ':' || *ptr1 == '\0') { /* check for the forward slash at the end of the path variable */ if (*ptr2 != '/') { ptr2++; *ptr2 = '/'; } strcat(buf, filename); rc = stat(buf, &statBuf); if (rc == 0) { return(TRUE); } /* Advance the pointer in the path */ ptr1++; /* reset ptr2 to the beginning of buf 
and get the next directory */ memset(buf, 0, RETURN_STRING_SIZE); ptr2 = buf; } else ptr2++; /* advance ptr2 to the next element in buf */ }while(*ptr1 != '\0'); return(FALSE); } static const char *physmem( struct rm_attribute *attrib) { char tmpBuf[PMEMBUF_SIZE]; char *BPtr; int BSpace; unsigned long long mem; unsigned long long mem_total; FILE *fp; #ifdef NUMA_SUPPORT int i; #endif if (attrib != NULL) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } mem_total = 0; #ifdef NUMA_SUPPORT for (i = 0; i < node_boards[numa_index].num_nodes; i++) #endif /* NUMA_SUPPORT */ { #ifdef NUMA_SUPPORT if (!(fp = fopen(node_boards[numa_index].path_meminfo[i],"r"))) #else if (!(fp = fopen(path_meminfo, "r"))) #endif { rm_errno = RM_ERR_SYSTEM; return(NULL); } BPtr = tmpBuf; BSpace = sizeof(tmpBuf); BPtr[0] = '\0'; while (!feof(fp)) { if (fgets(BPtr, BSpace, fp) == NULL) { break; } BSpace -= strlen(BPtr); BPtr += strlen(BPtr); } fclose(fp); /* FORMAT: '...\nMemTotal: XXX kB\n' */ if ((BPtr = strstr(tmpBuf, "MemTotal:")) != NULL) { BPtr += strlen("MemTotal:"); if (sscanf(BPtr, "%llu", &mem) != 1) { rm_errno = RM_ERR_SYSTEM; return(NULL); } /* value specified in kb */ } else { /* attempt to load first numeric value */ if (sscanf(BPtr, "%*s %llu", &mem) != 1) { rm_errno = RM_ERR_SYSTEM; return(NULL); } /* value specified in bytes */ mem >>= 10; } mem_total += mem; } sprintf(ret_string, "%llukb", mem_total); return(ret_string); } /* END physmem() */ char *size_fs( char *param) { struct statfs fsbuf; if (param[0] != '/') { sprintf(log_buffer, "%s: not full path filesystem name: %s", __func__, param); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (statfs(param, &fsbuf) == -1) { log_err(errno, __func__, "statfs"); rm_errno = RM_ERR_BADPARAM; return(NULL); } #ifdef RPT_BAVAIL #define RPT_STATFS_MEMBER f_bavail #else #define RPT_STATFS_MEMBER f_bfree #endif sprintf(ret_string, "%lukb:%lukb", (ulong)(((double)fsbuf.f_bsize * 
(double)fsbuf.RPT_STATFS_MEMBER) / 1024.0), (ulong)(((double)fsbuf.f_bsize * (double)fsbuf.f_blocks) / 1024.0)); /* KB */ return(ret_string); } /* END size_fs() */ char *size_file( char *param) { struct stat sbuf; if (param[0] != '/') { sprintf(log_buffer, "%s: not full path filesystem name: %s", __func__, param); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (stat(param, &sbuf) == -1) { log_err(errno, __func__, "stat"); rm_errno = RM_ERR_BADPARAM; return(NULL); } sprintf(ret_string, "%lukb", (unsigned long)sbuf.st_size >> 10); /* KB */ return(ret_string); } /* END size_file() */ const char *size( struct rm_attribute *attrib) { char *param; if (attrib == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if (momgetattr(NULL)) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } param = attrib->a_value; if (strcmp(attrib->a_qualifier, "file") == 0) { return(size_file(param)); } if (strcmp(attrib->a_qualifier, "fs") == 0) { return(size_fs(param)); } rm_errno = RM_ERR_BADPARAM; return(NULL); } /* END size() */ /* * For a recovering (-p) mom, look through existing tasks in existing * jobs for things that have exited that are not owned by us through a * parent-child relationship. Otherwise we cannot report back to tm * clients when tasks have exited. 
*/ void scan_non_child_tasks(void) { job *pJob; static int first_time = TRUE; DIR *pdir; /* use local pdir to prevent race conditions associated w/global pdir (VPAC) */ pdir = opendir(procfs); for (pJob = (job *)(GET_NEXT(svr_alljobs)); pJob != (job *)NULL;pJob = (job *)(GET_NEXT(pJob->ji_alljobs))) { task *pTask; long job_start_time = 0; long job_session_id = 0; long session_start_time = 0; proc_stat_t *ps = NULL; if(pJob->ji_wattr[JOB_ATR_system_start_time].at_flags&ATR_VFLAG_SET) { job_start_time = pJob->ji_wattr[JOB_ATR_system_start_time].at_val.at_long; } if(pJob->ji_wattr[JOB_ATR_session_id].at_flags&ATR_VFLAG_SET) { job_session_id = pJob->ji_wattr[JOB_ATR_session_id].at_val.at_long; } if((ps = get_proc_stat(job_session_id)) != NULL) { session_start_time = (long)ps->start_time; } for (pTask = (task *)(GET_NEXT(pJob->ji_tasks)); pTask != NULL; pTask = (task *)(GET_NEXT(pTask->ti_jobtask))) { #ifdef PENABLE_LINUX26_CPUSETS struct pidl *pids = NULL; struct pidl *pp; #else struct dirent *dent; #endif pid_t pid; int found; /* * Check for tasks that were exiting when mom went down, set back to * running so we can reprocess them and send the obit */ if ((first_time) && (pTask->ti_qs.ti_sid != 0) && ((pTask->ti_qs.ti_status == TI_STATE_EXITED) || (pTask->ti_qs.ti_status == TI_STATE_DEAD))) { if (LOGLEVEL >= 7) { sprintf(log_buffer, "marking task %d as TI_STATE_RUNNING was %d", pTask->ti_qs.ti_task, pTask->ti_qs.ti_status); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pJob->ji_qs.ji_jobid, log_buffer); } pTask->ti_qs.ti_status = TI_STATE_RUNNING; } /* only check on tasks that we think should still be around */ if (pTask->ti_qs.ti_status != TI_STATE_RUNNING) continue; /* look for processes with this session id */ found = 0; /* NOTE: on linux systems, the session master should have pid == sessionid */ if (kill(pTask->ti_qs.ti_sid, 0) != -1) { if((job_start_time != 0)&& (session_start_time != 0)) { if(job_start_time == session_start_time) { found = 1; } } else { found = 
1; } } if(!found) { /* session master cannot be found, look for other pid in session */ #ifdef PENABLE_LINUX26_CPUSETS pids = get_cpuset_pidlist(pJob->ji_qs.ji_jobid, pids); pp = pids; while (pp != NULL) { pid = pp->pid; pp = pp->next; #else if (pdir == NULL) { if ((pdir = opendir(procfs)) == NULL) return; } rewinddir(pdir); while ((dent = readdir(pdir)) != NULL) { if (!isdigit(dent->d_name[0])) continue; pid = atoi(dent->d_name); #endif /* PENABLE_LINUX26_CPUSETS */ if ((ps = get_proc_stat(pid)) == NULL) continue; if (ps->session == pTask->ti_qs.ti_sid) { if(pJob->ji_wattr[JOB_ATR_system_start_time].at_flags&ATR_VFLAG_SET) { proc_stat_t *ts = get_proc_stat(ps->session); if(ts == NULL) continue; if(ts->start_time == (unsigned long)pJob->ji_wattr[JOB_ATR_system_start_time].at_val.at_long) { found = 1; break; } } else { found = 1; break; } } } /* END while ((dent) != NULL) */ #ifdef PENABLE_LINUX26_CPUSETS free_pidlist(pids); #endif } if (!found) { char buf[MAXLINE]; extern int exiting_tasks; sprintf(buf, "found exited session %d for task %d in job %s", pTask->ti_qs.ti_sid, pTask->ti_qs.ti_task, pJob->ji_qs.ji_jobid); log_event(PBSEVENT_JOB, PBS_EVENTCLASS_JOB, __func__, buf); pTask->ti_qs.ti_exitstat = 0; /* actually unknown */ pTask->ti_qs.ti_status = TI_STATE_EXITED; task_save(pTask); #ifdef USESAVEDRESOURCES if (first_time) { pJob->ji_flags |= MOM_JOB_RECOVERY; if (LOGLEVEL >= 7) { sprintf(buf, "marking job as MOM_JOB_RECOVERY for task %d", pTask->ti_qs.ti_task); log_event(PBSEVENT_DEBUG, PBS_EVENTCLASS_JOB, pJob->ji_qs.ji_jobid, buf); } } #endif /* USESAVEDRESOURCES */ exiting_tasks = 1; } } } /* END for (job = GET_NEXT(svr_alljobs)) */ if (pdir != NULL) closedir(pdir); first_time = FALSE; return; } /* END scan_non_child_tasks() */ time_t maxtm; void setmax( const char *dev) { struct stat sb; if (stat(dev, &sb) == -1) { return; } if (maxtm < sb.st_atime) maxtm = sb.st_atime; return; } /* END setmax() */ const char *idletime( struct rm_attribute *attrib) { DIR 
*dp; struct dirent *de; char ttyname[50]; time_t curtm; if (attrib) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((dp = opendir("/dev")) == NULL) { log_err(errno, __func__, "opendir /dev"); rm_errno = RM_ERR_SYSTEM; return(NULL); } maxtm = 0; curtm = time(NULL); setmax("/dev/mouse"); while ((de = readdir(dp)) != NULL) { if (maxtm >= curtm) break; if (strncmp(de->d_name, "tty", 3)) continue; sprintf(ttyname, "/dev/%s", de->d_name); setmax(ttyname); } closedir(dp); sprintf(ret_string, "%ld", (long)MAX(0, curtm - maxtm)); return(ret_string); } /* END idletime() */ static const char *walltime( struct rm_attribute *attrib) { int value, job, found = 0; time_t now, start; proc_stat_t *ps; int i; if (attrib == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if ((value = atoi(attrib->a_value)) == 0) { sprintf(log_buffer, "bad param: %s", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (momgetattr(NULL)) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "proc") == 0) { job = 0; } else if (strcmp(attrib->a_qualifier, "session") == 0) { job = 1; } else { rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((now = time(NULL)) <= 0) { log_err(errno, __func__, "time"); rm_errno = RM_ERR_SYSTEM; return(NULL); } start = now; for (i = 0;i < nproc;i++) { ps = &proc_array[i]; if (job != 0) { if (value != ps->session) continue; } else { if (value != ps->pid) continue; } found = 1; start = MIN((unsigned)start, ps->start_time); } /* END for (i) */ if (found) { sprintf(ret_string, "%ld", (long)((double)(now - start) * wallfactor)); return(ret_string); } rm_errno = RM_ERR_EXIST; return(NULL); } /* END walltime() */ /* Get the load average for this node */ int get_la( double *rv) /* O */ { FILE *fp; float load; if ((fp = fopen("/proc/loadavg", "r")) == NULL) { rm_errno = RM_ERR_SYSTEM; return(rm_errno); } if 
(fscanf(fp, "%f", &load) != 1) { log_err(errno, __func__, "fscanf of load in /proc/loadavg"); fclose(fp); rm_errno = RM_ERR_SYSTEM; return(rm_errno); } *rv = (double)load; fclose(fp); return(0); } /* END get_la() */ #ifdef NUMA_SUPPORT /* * Calculate cpu activities for numa nodeboards. * * This is a very preliminary attempt to provide useful load data for NUMA nodeboards. * Instead of a load average, we report the cpu activities of all cpus of a NUMA board. * Calculated numbers range from 0.0 (no CPU activity) to the number of * CPUs of a NUMA board (all CPUs are busy to 100%). * * Note that this is NOT the load average. However, it almost looks the same. * * The activity of a cpu is calculated from the content of /proc/stat like done * by top and related tools. */ void collect_cpuact(void) { FILE *fp; char label[128]; long procs; int cpu_id; int i; unsigned long long usr, nice, sys, idle, wait; unsigned long long totidle, totbusy, prevtot; unsigned long long dtot, dbusy; /* * Allocate cpu_array, if not already done. * Need to figure out number of cpus in the system, first. */ if (cpu_array == NULL) { if ((fp = fopen("/proc/cpuinfo", "r")) == NULL) /* Failure */ return; procs = 0; while (! feof(fp)) { if (fscanf(fp, "%s %*[^\n]%*c", label) == 0) getc(fp); else if (strcmp("processor", label) == 0) procs++; } fclose(fp); system_ncpus = procs; sprintf(log_buffer, "system contains %ld CPUs", system_ncpus); log_record(PBSEVENT_SYSTEM, 0, __func__, log_buffer); if (system_ncpus) { if ((cpu_array = (proc_cpu_t *)calloc(system_ncpus, sizeof(proc_cpu_t))) == NULL) { log_err(errno, __func__, "failed to allocate memory"); return; } } } /* Zero out cpu_array */ memset(cpu_array, 0, system_ncpus * sizeof(proc_cpu_t)); /* Parse CPU counters from /proc/stat */ if ((fp = fopen("/proc/stat", "r")) != NULL) { while (! 
feof(fp)) { if (fscanf(fp, "%s", label) != 1) /* Format error */ break; if (sscanf(label, "cpu%d", &cpu_id) != 1) /* Line does not report cpu activities */ continue; if (cpu_id >= system_ncpus) /* Ups, more cpus than found in /proc/cpuinfo */ break; if (fscanf(fp, " %llu %llu %llu %llu %llu", &usr, &nice, &sys, &idle, &wait) != 5) /* Format error */ break; cpu_array[cpu_id].idle_total = idle; cpu_array[cpu_id].busy_total = usr + nice + sys + wait; } fclose(fp); } /* END if (fp) */ /* Calculate cpu activity for each nodeboard */ for (i = 0; i < num_node_boards; i++) { /* Sum up cpu counters of relevant CPUs */ totidle = totbusy = 0; hwloc_bitmap_foreach_begin(cpu_id, node_boards[i].cpuset) { totidle += cpu_array[cpu_id].idle_total; totbusy += cpu_array[cpu_id].busy_total; } hwloc_bitmap_foreach_end(); /* If there are counters from a previous call, evaluate */ if ((prevtot = node_boards[i].pstat_idle + node_boards[i].pstat_busy) != 0) { dbusy = totbusy - node_boards[i].pstat_busy; /* diff busy counter sum */ dtot = totbusy + totidle - prevtot; /* diff total counter sum */ node_boards[i].cpuact = (float)(node_boards[i].num_cpus * dbusy / (double)dtot); } else { node_boards[i].cpuact = 0; } /* Remember counter sums */ node_boards[i].pstat_idle = totidle; node_boards[i].pstat_busy = totbusy; } /* END for(i) */ return; } /* END collect_cpuact() */ const char *cpuact( struct rm_attribute *attrib) { if (attrib != NULL) { log_err(-1, __func__, extra_parm); rm_errno = RM_ERR_BADPARAM; return(NULL); } sprintf(ret_string, "%.2f", node_boards[numa_index].cpuact); if (LOGLEVEL >= 6) { sprintf(log_buffer, "cpuact=%s", ret_string); log_record(PBSEVENT_SYSTEM, PBS_EVENTCLASS_NODE, __func__, log_buffer); } return(ret_string); } /* END cpuact() */ #endif u_long gracetime( u_long secs) { time_t now = time((time_t *)NULL); if (secs > (u_long)now) /* time is in the future */ return(secs - now); return(0); } static const char *quota( struct rm_attribute *attrib) { int type; dev_t dirdev; 
uid_t uid; struct stat sb; struct mntent *me; struct dqblk qi; FILE *m; struct passwd *pw; static const char *type_array[] = { "harddata", "softdata", "currdata", "hardfile", "softfile", "currfile", "timedata", "timefile", NULL }; enum type_name { harddata, softdata, currdata, hardfile, softfile, currfile, timedata, timefile, type_end }; if (attrib == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "type")) { sprintf(log_buffer, "unknown qualifier %s", attrib->a_qualifier); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } for (type = 0;type < type_end;type++) { if (strcmp(attrib->a_value, type_array[type]) == 0) break; } if (type == type_end) { /* check to see if command is legal */ sprintf(log_buffer, "bad param: %s=%s", attrib->a_qualifier, attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if ((attrib = momgetattr(NULL)) == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return(NULL); } if (strcmp(attrib->a_qualifier, "dir") != 0) { sprintf(log_buffer, "bad param: %s=%s", attrib->a_qualifier, attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return(NULL); } if (attrib->a_value[0] != '/') /* must be absolute path */ { sprintf(log_buffer, "not an absolute path: %s", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return NULL; } if (stat(attrib->a_value, &sb) == -1) { sprintf(log_buffer, "stat: %s", attrib->a_value); log_err(errno, __func__, log_buffer); rm_errno = RM_ERR_EXIST; return NULL; } dirdev = (dev_t)sb.st_dev; DBPRT(("dir has devnum %d\n", (int)dirdev)) if ((m = setmntent(MOUNTED, "r")) == NULL) { log_err(errno, __func__, "setmntent"); rm_errno = RM_ERR_SYSTEM; return NULL; } while ((me = getmntent(m)) != NULL) { if (strcmp(me->mnt_type, MNTTYPE_IGNORE) == 0) continue; if (stat(me->mnt_dir, &sb) == -1) { sprintf(log_buffer, "stat: 
%s", me->mnt_dir); log_err(errno, __func__, log_buffer); continue; } if (LOGLEVEL >= 6) DBPRT(("%s\t%s\t%d\n", me->mnt_fsname, me->mnt_dir, (int)dirdev)) if (!memcmp(&sb.st_dev, &dirdev, sizeof(dev_t))) break; } endmntent(m); if (me == NULL) { sprintf(log_buffer, "filesystem %s not found", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_EXIST; return NULL; } #if defined(MNTOPT_NOQUOTA) if (hasmntopt(me, MNTOPT_NOQUOTA) != NULL) { sprintf(log_buffer, "no quotas on filesystem %s", me->mnt_dir); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_EXIST; return NULL; } #endif /* MNTOPT_NOQUOTA */ if ((attrib = momgetattr(NULL)) == NULL) { log_err(-1, __func__, no_parm); rm_errno = RM_ERR_NOPARAM; return NULL; } if (strcmp(attrib->a_qualifier, "user") != 0) { sprintf(log_buffer, "bad param: %s=%s", attrib->a_qualifier, attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_BADPARAM; return NULL; } if ((uid = (uid_t)atoi(attrib->a_value)) == 0) { if ((pw = getpwnam_ext(attrib->a_value)) == NULL) { sprintf(log_buffer, "user not found: %s", attrib->a_value); log_err(-1, __func__, log_buffer); rm_errno = RM_ERR_EXIST; return NULL; } uid = pw->pw_uid; } if (syscall( SYS_quotactl, QCMD(Q_GETQUOTA, USRQUOTA), me->mnt_fsname, uid, (caddr_t)&qi) == -1) { log_err(errno, __func__, "quotactl"); rm_errno = RM_ERR_SYSTEM; return(NULL); } /* sizes in KB */ switch (type) { case harddata: sprintf(ret_string, "%lukb", (u_long)qi.dqb_bhardlimit >> 10); break; case softdata: sprintf(ret_string, "%lukb", (u_long)qi.dqb_bsoftlimit >> 10); break; case currdata: #if defined(TENABLEQUOTA) #if _LINUX_QUOTA_VERSION < 2 sprintf(ret_string, "%lukb", (u_long)qi.dqb_curblocks >> 10); #else /* _LINUX_QUOTA_VERSION < 2 */ sprintf(ret_string, "%lukb", (u_long)qi.dqb_curspace >> 10); #endif /* _LINUX_QUOTA_VERSION < 2 */ #endif /* TENABLEQUOTA */ break; case hardfile: sprintf(ret_string, "%lu", (u_long)qi.dqb_ihardlimit); break; case softfile: sprintf(ret_string, 
"%lu", (u_long)qi.dqb_isoftlimit); break; case currfile: sprintf(ret_string, "%lu", (u_long)qi.dqb_curinodes); break; case timedata: sprintf(ret_string, "%lu", gracetime((u_long)qi.dqb_btime)); break; case timefile: sprintf(ret_string, "%lu", gracetime((u_long)qi.dqb_itime)); break; } /* END switch() */ return(ret_string); } /* END quota() */ /* tested for linux 2.4 kernel (not tested on 2.6) */ #define MAX_INTERFACES 10 /*the maximum number of interfaces*/ #define HEADER_STR "%*[^\n]\n%*[^\n]\n" #define INTERFACE_STR "%[^:]:%lu %*d %*d %*d %*d %*d %*d %*d %lu %*d %*d %*d %*d %*d %*d %*d\n" static const char *netload( struct rm_attribute *attrib) { #ifdef NUMA_SUPPORT /* there's no way to determine these numbers for a numa node */ return(NULL); #else FILE *fp; int rc; /*read count*/ char interfaceName[MAX_INTERFACES][32]; unsigned long int bytesRX[MAX_INTERFACES + 1]; unsigned long int bytesTX[MAX_INTERFACES + 1]; int interface = 0; /* int ethNum = 0; */ if ((fp = fopen("/proc/net/dev", "r")) == NULL) { rm_errno = RM_ERR_SYSTEM; return(NULL); } rc = fscanf(fp, HEADER_STR); /*strip off header lines*/ if (rc < 0) { log_err(errno, __func__, "fscanf of header lines in /proc/net/dev"); fclose(fp); rm_errno = RM_ERR_SYSTEM; return(NULL); } /* read in interface stats until we can't */ /* sum all interface stats, excluding 'lo'*/ memset(bytesRX, 0, sizeof(bytesRX)); memset(bytesTX, 0, sizeof(bytesTX)); for (interface = 0;interface < MAX_INTERFACES;interface++) { rc = fscanf(fp, INTERFACE_STR, interfaceName[interface], &bytesRX[interface], &bytesTX[interface]); if (rc != 3) { interface++; /*adjust counter for future decrement*/ break; } if (strcmp(interfaceName[interface], "lo") != 0) /* don't count 'lo' interfaces' stats */ { /* For singling out ethernet interfaces */ /* if (strncmp(interfaceName[interface],"eth",3) == 0) { rc = sscanf(interfaceName[interface],"eth%d", ðNum); } */ bytesRX[MAX_INTERFACES] += bytesRX[interface]; bytesTX[MAX_INTERFACES] += bytesTX[interface]; 
} } /* END for (interface) */ /* remove lo from interface count */ --interface; fclose(fp); sprintf(ret_string, "%lu", bytesRX[MAX_INTERFACES] + bytesTX[MAX_INTERFACES]); return(ret_string); #endif /* NUMA_SUPPORT */ } /* END netload() */ mbool_t ProcIsChild( char *Dir, /* I */ pid_t PID, /* I */ char *JobID) /* I */ { return(FALSE); } /* END ProcIsChild() */ /* END mom_mach.c */