#!/usr/bin/perl

# *****************************************************************************
#
#  Copyright 2011 Zuse Institute Berlin
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
#  Please send comments to kallies@zib.de
#
# *****************************************************************************
# Purpose: - called from /etc/init.d/pbs_mom during start actions.
#          - creates /var/spool/torque/mom_priv/mom.layout
#          - creates/modifies /dev/cpuset/torque
# Prereq:  - hwloc >= 1.1, http://www.open-mpi.org/projects/hwloc/
#          - Sys::Hwloc >= 0.09, http://search.cpan.org/~bka/
# Install: Install this script on each UV rack
#            /opt/torque/Scripts/mom_gencfg root:root -rwxr-xr-x
# Config:  Set MOM_GENCFG=/opt/torque/Scripts/mom_gencfg
#          in /etc/init.d/pbs_mom for UV, execute $MOM_GENCFG before
#          starting the pbs_mom daemon.
#          MOM_GENCFG can be overridden in /etc/sysconfig/pbs_mom.
# *****************************************************************************
# $Id: mom_gencfg,v 1.1.2.1 2011/01/17 10:12:46 acountin Exp $
# *****************************************************************************
#
# *** Instructions for use ***
#
# 1. Install hwloc - see contrib/hwloc_install.sh. This should already be done since
#    TORQUE needs hwloc for its cpuset implementation starting in 4.0
# 2. Install Sys::Hwloc from CPAN
# 3. Set $PBS_HOME to the proper value if not already set
# 4. Update the variables in the section 'Config Definitions' Especially update firstNodeId
#    and nodesPerBoard if desired.
#      firstNodeId should be set above 0 if you have a root cpuset that you wish to exclude
#      nodesPerBoard is the number of numa nodes per board. Each node is defined in the
#      directory /sys/devices/system/node, in a subdirectory nodeN (e.g. node0)
# 5. Backup your current file, just in case a variable is set incorrectly or neglected
# 6. Run this script and enjoy the layout file
#
#

use strict;
use warnings;

use lib qw(
           /usr/lib/perl5
           /usr/lib/perl5/site_perl
           );

use Sys::Hostname;
use File::Basename;
use Getopt::Long qw(:config no_ignore_case);
use autouse 'Pod::Usage' => qw(pod2usage);
use Sys::Hwloc 0.09;

my $progName = basename($0);
my $hostName = hostname();

# Prefix every fatal message with the program name (see xDie below).
$SIG{__DIE__} = \&xDie;

# ==============================================================================
# Setup needed before init
# ==============================================================================

# NOTE: "BEGIN:" is a statement label on a bare block, not a compile-time
# BEGIN block. The check therefore runs at run time, after $progName and the
# die handler have been set up.
BEGIN: {
    die "This script needs at least hwloc-1.1\n" unless HWLOC_XSAPI_VERSION() >= 0x00010100;
}

# ==============================================================================
# Config definitions
# ==============================================================================

my $hostNames      = undef;          # hostname pattern to be run on, undef to skip test
my $cpusetFsName   = '/dev/cpuset';  # the name of the cpuset file system
my $cpusetBaseName = '/torque';      # the name of the parent cpuset of a job's cpuset
my $mkdirCmd       = '/bin/mkdir';   # the path to the mkdir command
my $catCmd         = '/bin/cat';     # the path to the cat command
my $echoCmd        = '/bin/echo';    # the path to the echo command
my $momCfgDir      = 'mom_priv';     # the directory where MOM configs are stored
my $momLayoutFile  = 'mom.layout';   # the name of the MOM layout file
my $firstNodeId    = 0;              # ID of 1st NUMA node to be used by Torque (start with 0)
my $lastNodeId     = undef;          # ID of last NUMA node to be used (undef means last available)
my $nodesPerBoard  = 1;              # number of NUMA nodes per nodeboard

my %cpusetConf = (
    cpus           => undef,         # undef means auto-generate
    mems           => undef,         # undef means auto-generate
    cpu_exclusive  => 1,             #
    mem_exclusive  => 1,             #
);

my %options = (
    -doLayout      => 1,             # generate mom.layout
    -withCpus      => 1,             # include cpus in mom.layout
    -withMems      => 1,             # include mems in mom.layout
    -doCpuset      => 1,             # generate/modify /torque cpuset
    -withSmt       => 1,             # include logical processors running on the same core
    -verbose       => undef,         # be verbose to STDERR
    -dryRun        => undef,         # no actions, just tell what would be done
);

# ==============================================================================
# Command line options
# ==============================================================================

GetOptions(
    "layout!"  => \$options{-doLayout},
    "cpus!"    => \$options{-withCpus},
    "mems!"    => \$options{-withMems},
    "smt!"     => \$options{-withSmt},
    "cpuset!"  => \$options{-doCpuset},
    "dry-run!" => \$options{-dryRun},
    "verbose!" => \$options{-verbose},
    "help|?"   => sub { usage(0) },
    "man"      => sub { manPage() },
) or usage(2);

# A dry run is verbose by default, unless -noverbose was given explicitly
# (in which case -verbose is defined but false).
if($options{-dryRun}) {
    $options{-verbose} = 1 unless defined $options{-verbose};
    xDebug(">>> DryRunDryRunDryRunDryRunDryRun <<<");
}

# ==============================================================================
# Quick exit if not wanted on this host, or if no work to do
# ==============================================================================

#if(defined $hostNames) {
#    unless($hostName =~ /$hostNames/) {
#        xDebug("--- Don't run on $hostName ---");
#        exit 0;
#    }
#}

exit 0 unless ($options{-doLayout} || $options{-doCpuset});

# ==============================================================================
# See if PBS_HOME is set, and if $PBS_HOME/mom_priv exists.
# If not, we are probably not called correctly, thus die.
# See if cpusets are configured. If not, die.
# ==============================================================================

die "\$PBS_HOME not set\n" unless (exists $ENV{PBS_HOME} && $ENV{PBS_HOME});
die "PBS_HOME=$ENV{PBS_HOME} does not exist\n" unless -d $ENV{PBS_HOME};
$momCfgDir = "$ENV{PBS_HOME}/${momCfgDir}";
die "MOM config dir $momCfgDir does not exist\n" unless -d $momCfgDir;
$momLayoutFile = "${momCfgDir}/${momLayoutFile}";
die "this system does not support cpusets\n" unless -d $cpusetFsName;

# ==============================================================================
# Figure out system topology, collect wanted node objects
# ==============================================================================

my $topology = Sys::Hwloc::Topology->init;
die "Failed to init topology\n" unless defined $topology;
$topology->set_flags(HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM);
# A true return value from load is treated as failure.
die("Failed to load topology\n") if $topology->load;

# ==============================================================================
# Collect nodesets of wanted NUMA nodes per nodeBoard
# ==============================================================================

# Each entry of @nodeBoards is { cpuset => Bitmap, nodeset => Bitmap }.
# $nNodes counts the nodes already put on the board currently being filled;
# it wraps to 0 once $nodesPerBoard nodes were collected, which makes the
# next wanted node start a new board.
my @nodeBoards = ();
my $nodeObj    = undef;
my $nNodes     = 0;
while($nodeObj = $topology->get_next_obj_by_type(HWLOC_OBJ_NODE, $nodeObj)) {
    my $nodeId = $nodeObj->logical_index;
    next if $nodeId < $firstNodeId;
    last if (defined $lastNodeId && $nodeId > $lastNodeId);
    if($nNodes) {
        $nodeBoards[$#nodeBoards]->{nodeset}->or($nodeObj->nodeset);
    } else {
        push @nodeBoards, {
            cpuset  => Sys::Hwloc::Bitmap->new,
            nodeset => $nodeObj->nodeset->dup,
        };
    }
    $nNodes++;
    $nNodes = 0 if $nNodes >= $nodesPerBoard;
}

# ==============================================================================
# Assemble cpusets per nodeBoard
# ==============================================================================

foreach my $nodeBoard (@nodeBoards) {
    $topology->cpuset_from_nodeset_strict($nodeBoard->{cpuset}, $nodeBoard->{nodeset});
    next if $options{-withSmt};

    # Without SMT: for every core of the board keep only the PU at index 0
    # and clear all further PUs (index 1, 2, ...) from the board's cpuset.
    my $core = undef;
    while($core = $topology->get_next_obj_inside_cpuset_by_type($nodeBoard->{cpuset}, HWLOC_OBJ_CORE, $core)) {
        my $j = 1;
        while(my $pu = $topology->get_obj_inside_cpuset_by_type($core->cpuset, HWLOC_OBJ_PU, $j++)) {
            $nodeBoard->{cpuset}->andnot($pu->cpuset);
        }
    }
}

# ==============================================================================
# Generate mom.layout
# ==============================================================================

if($options{-doLayout}) {

    xDebug("--- Generating $momLayoutFile ---");

    # The file is only opened (and thereby truncated) on a real run.
    my $layoutFh;
    if(! $options{-dryRun}) {
        open($layoutFh, '>', $momLayoutFile) or die "failed to open $momLayoutFile: $!\n";
    }

    foreach my $nodeBoard (@nodeBoards) {
        my $line = sprintf("nodes=%s", $nodeBoard->{nodeset}->sprintf_list);
        $line   .= sprintf(" cpus=%s", $nodeBoard->{cpuset}->sprintf_list)  if $options{-withCpus};
        $line   .= sprintf(" mems=%s", $nodeBoard->{nodeset}->sprintf_list) if $options{-withMems};
        xDebug(" $line");
        print {$layoutFh} "$line\n" if $layoutFh;
    }

    # Buffered write errors surface at close, so check it.
    if($layoutFh) {
        close($layoutFh) or die "failed to write $momLayoutFile: $!\n";
    }
}

# ==============================================================================
# Create/modify torque cpuset
# ==============================================================================

if($options{-doCpuset}) {

    my $cpusetPath = "${cpusetFsName}${cpusetBaseName}";
    my $cpusetData;

    if(! -d $cpusetPath && $options{-dryRun}) {

        # A dry run must not touch the system, so the cpuset is not created.
        # Pretend that every configured key exists but holds no value, so
        # that all settings are reported below as pending changes.
        xDebug("--- Would create $cpusetPath ---");
        $cpusetData = { map { $_ => undef } keys %cpusetConf };

    } else {

        # Create it if it is not there
        if(! -d $cpusetPath) {
            xDebug("--- Creating $cpusetPath ---");
            my $rc = execCmd($mkdirCmd, 1, $cpusetPath);
            die "Failed to create $cpusetPath\n" unless defined $rc;
        }

        # Read content
        xDebug("--- Reading $cpusetPath ---");
        $cpusetData = readCpuset($cpusetPath);
        die "Failed to read $cpusetPath\n" unless defined $cpusetData;

    }

    # Assemble changes: a key is changed when it exists in the cpuset and its
    # current value is unset or differs from the wanted one.
    my %cpusetMod = ();
    foreach my $key (keys %cpusetConf) {
        next unless exists $cpusetData->{$key};
        my $val = $cpusetConf{$key};

        # Auto-generate cpus/mems as the union over all node boards
        if(! defined $val && $key eq 'cpus') {
            my $cpuset = Sys::Hwloc::Bitmap->new;
            foreach my $nodeBoard (@nodeBoards) {
                $cpuset->or($nodeBoard->{cpuset});
            }
            $val = $cpuset->sprintf_list;
            $cpuset->free;
        } elsif(! defined $val && $key eq 'mems') {
            my $nodeset = Sys::Hwloc::Bitmap->new;
            foreach my $nodeBoard (@nodeBoards) {
                $nodeset->or($nodeBoard->{nodeset});
            }
            $val = $nodeset->sprintf_list;
            $nodeset->free;
        }

        next unless defined $val;
        if(! defined $cpusetData->{$key} || $cpusetData->{$key} ne $val) {
            $cpusetMod{$key} = $val;
        }
    }

    # Write changes, if any. Don't abort on error, but warn if changes not done
    if(%cpusetMod) {
        xDebug("--- Modifying $cpusetPath ---");
        if($options{-dryRun}) {
            while(my ($key, $val) = each %cpusetMod) {
                xDebug(sprintf(" = cpuset %s: %-25s %s", $cpusetPath, $key, $val));
            }
        } else {
            # The redirection is interpreted by the shell that execCmd starts.
            # Failures are reported by execCmd itself (when verbose).
            while(my ($key, $val) = each %cpusetMod) {
                execCmd($echoCmd, 0, "$val > ${cpusetPath}/$key");
            }
            if($options{-verbose}) {
                # Read back and mark each wanted value with '=' when it was
                # accepted, or with '-' when the cpuset still differs.
                $cpusetData = readCpuset($cpusetPath);
                die "Failed to read $cpusetPath\n" unless defined $cpusetData;
                while(my ($key, $val) = each %cpusetMod) {
                    my $cur  = $cpusetData->{$key};
                    my $mark = (defined $cur && $cur eq $val) ? '=' : '-';
                    xDebug(sprintf(" %s cpuset %s: %-25s %s", $mark, $cpusetPath, $key, $val));
                }
            }
        }
    }

}

# ==============================================================================
# All done
# ==============================================================================

$topology->destroy;

exit 0;

# #############################################################################

# ==============================================================================
# Read cpuset data into a hash.
# Arg:     path of the cpuset directory.
# Returns: hashref { key => first output line of the key's file, or undef if
#          the file is empty } for every known key whose file exists.
#          Returns undef if the cpuset does not exist or a file cannot be read.
# ==============================================================================

sub readCpuset {
    my $cpusetPath = shift;
    my $cpusetData = {};

    # Check if cpuset exists
    unless(-d $cpusetPath) {
        xDebug("ERROR: Cpuset $cpusetPath does not exist.");
        return;
    }

    # Read content of cpuset
    foreach my $key (qw(
                        cpu_exclusive
                        cpus
                        mem_exclusive
                        mem_hardwall
                        memory_migrate
                        memory_pressure
                        memory_spread_page
                        memory_spread_slab
                        mems
                        notify_on_release
                        sched_load_balance
                        sched_relax_domain_level
                        )) {
        my $f = "${cpusetPath}/$key";
        next unless -e $f;
        my $rc = execCmd($catCmd, 0, $f);
        return unless defined $rc;       # Command failed

        # Only the first line is of interest; an empty file yields undef
        my $val = @{$rc} ? $rc->[0] : undef;

        xDebug(sprintf(" cpuset %s: %-25s %s", $cpusetPath, $key, defined $val ? $val : "NO DATA"));
        $cpusetData->{$key} = $val;
    }

    return $cpusetData;
}

# ==============================================================================
# Execute a command with args.
# Args:    command path, verbose flag (announce the command via xDebug),
#          list of arguments.
# The command line is assembled as a single string and run through the shell
# with stderr merged into stdout, thus arguments may contain redirections.
# Returns arrayref with chomped output on success.
# On command failure, print error msg and return undef.
# ==============================================================================

sub execCmd {
    my $cmdBase = shift;
    my $verbose = shift;
    my @cmdArgs = @_;

    if(! $cmdBase) {
        xDebug("ERROR execCmd: need \$cmdBase.");
        return;
    }

    # --
    # Check if cmdBase is executable
    # --

    if(! -x $cmdBase) {
        xDebug("ERROR: File \"$cmdBase\" does not exist or is not executable.");
        return;
    }

    # --
    # Execute
    # --

    my $cmd = $cmdBase;
    $cmd .= (" " . join(" ", @cmdArgs)) if @cmdArgs;
    xDebug(" About to execute \"$cmd\"") if $verbose;

    # Deliberately the two-argument pipe form: the string has to go through
    # the shell for "2>&1" and for redirections contained in the arguments.
    open(my $cmdFh, "$cmd 2>&1 |") or do {
        xDebug("ERROR: Failed to execute \"$cmd\": $!");
        return;
    };
    my @cmdOut = <$cmdFh>;
    chomp @cmdOut;
    close($cmdFh);                       # waits for the command and sets $?
    my $rc = $? >> 8;
    if($rc) {
        xDebug("ERROR: Command \"$cmd\" returned rc = $rc");
        if(@cmdOut) {
            # Show at most the first three lines of output, skip blank ones
            xDebug(join("\n", map { " $_" } grep { /\S/ } $#cmdOut < 3 ? @cmdOut : (@cmdOut[0..2], "...")));
        }
        return;
    }

    # --
    # Return output
    # --

    return \@cmdOut;
}

# ==============================================================================
# Usage message
# Prints the synopsis, then exits with the given code (default 0).
# ==============================================================================

sub usage {
    my $code = shift || 0;
    pod2usage(
              -verbose => 0,
              -exitval => "NOEXIT",
              );
    exit $code;
}

# ==============================================================================
# Man page
# Drops root privileges and sanitizes the environment before rendering the
# POD, since perldoc cannot be invoked as root.
# ==============================================================================

sub manPage {
    if ($< == 0) {                       # Cannot invoke perldoc as root
        my $id = eval { getpwnam("nobody") };
        $id = eval { getpwnam("nouser") } unless defined $id;
        $id = -2 unless defined $id;
        $< = $id;
    }
    $> = $<;                             # Disengage setuid
    $ENV{PATH} = "/bin:/usr/bin";        # Untaint PATH
    delete @ENV{ 'IFS', 'CDPATH', 'ENV', 'BASH_ENV' };
    if ($0 =~ /^([-\/\w\.]+)$/) {
        $0 = $1;                         # Untaint $0
    } else {
        die "Illegal characters were found in \$0 ($0)\n";
    }
    pod2usage(
              -verbose => 2,
              -exitval => 0,
              );
}

# ==============================================================================
# Verbose printing
# Prints each line of the joined arguments to STDERR, prefixed with the
# program name. Does nothing unless -verbose is on.
# ==============================================================================

sub xDebug {
    return unless $options{-verbose};
    my $msg = join("", @_);
    if($msg) {
        foreach(split("\n", $msg)) {
            print STDERR "$progName - $_\n"
        }
    } else {
        print STDERR "$progName - something to debug\n";
    }
}

# ==============================================================================
# Die handler, prefixes the message with the program name
# ==============================================================================

sub xDie {
    die "$progName - ", @_;
}

__END__

=head1 NAME

mom_gencfg - Create mom.layout and /dev/cpuset/torque, designed to be called from /etc/init.d/pbs_mom

=head1 SYNOPSIS

mom_gencfg --help|-?|--man

mom_gencfg -(no)layout -(no)cpus -(no)mems -(no)cpuset -(no)smt -(no)dry-run -(no)verbose

=head1 DESCRIPTION

This script creates /var/spool/torque/mom_priv/mom.layout and creates/modifies /dev/cpuset/torque
for a pbs_mom that is compiled with --enable-numa-support.

The basic configuration like number and offset of NUMA node IDs per nodeboard,
cpuset settings, and defaults of command line options is hardcoded in the script.

The script checks if I<PBS_HOME> is set in the environment. Usually this should point to /var/spool/torque.

=head1 OPTIONS

=over 4

=item B<-(no)layout>

Create the mom.layout file or not.

=item B<-(no)cpus>

mom.layout contains cpu IDs per nodeboard or not.

=item B<-(no)mems>

mom.layout contains memory node IDs per nodeboard or not.

=item B<-(no)cpuset>

Create/modify /dev/cpuset/torque or not.

=item B<-(no)smt>

The I<cpus> entry in mom.layout and in /dev/cpuset/torque contain additional logical processors
running on the same core or not.

=item B<-(no)dry-run>

If B<-dry-run> is given, show what would have been done. Switches B<-verbose> on, unless B<-noverbose> was given.

=item B<-(no)verbose>

Verbose printing to STDERR.

=item B<-man>

Prints this man page.

=item B<-help|-?>

Prints synopsis.

=back

=head1 AUTHOR

Bernd Kallies, E<lt>kallies@zib.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2011 Zuse Institute Berlin

This library is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation.

=cut