#!/usr/bin/perl

# $Id: lcg-expiregridmapdir.pl,v 1.6 2011/04/11 12:45:31 maart Exp $

use strict;
use Getopt::Std;

### Some default configuration values
our $opt_e = 48;             # Minimum cert. expiration time, in hours
our $opt_v = 0;              # Enable verbose messages
our $opt_n = 0;              # Dry run: only report what would be done
our $opt_u = 80;             # Maximum pool account usage perc. per VO
our $opt_p = 0;              # Preserve accounts with running processes
our $opt_a = 0;              # Wipe all files, including standard dot files
our $opt_X = 0;              # Expert mode: disable some sanity checks

our $opt_d = $ENV{GRIDMAPDIR};

if (!$opt_d) {
    open(CONF, '/etc/sysconfig/lcg-expiregridmapdir');
    while (<CONF>) {
	if (/^GRIDMAPDIR=(\S*)\s*$/) {
	    $opt_d = "$1";
	}
    }
    close(CONF);

    if (!$opt_d) {
	$opt_d = '/etc/grid-security/gridmapdir';
    }
}

### Get command line parameters

my $optchars = 'adenpuvX';
my %options;
getopt('deu', \%options);

for my $key (keys(%options)) {
    die "$0: unrecognized option '-$key'\n" if index($optchars, $key) < 0;

    my $arg = $options{$key};

    die "$0: bad argument for option '-$key': '$arg'\n" if $arg =~ /^-/;

    eval "\$opt_$key = \$arg";
}

unless ($opt_X) {
    die "$0: unsafe expiration time: '$opt_e'\n" if $opt_e < 48;
    die "$0: unsafe usage threshold: '$opt_u'\n" if $opt_u < 80;
}

my $all = $opt_a;
my $gmd = $opt_d;
my $dryrun = $opt_n;
my $preserve = $opt_p;
my $verbose = $opt_v;
my $thr = $opt_u / 100;

my $globus = $ENV{GLOBUS_LOCATION};

if (!$globus) {
    open(CONF, '/etc/sysconfig/globus');
    while (<CONF>) {
	if (/^GLOBUS_LOCATION=(\S*)\s*$/) {
	    $globus = $1;
	    last;
	}
    }
    close(CONF);

    if (!$globus) {
	$globus = '/opt/globus';
    }
}

my $cream_sandbox_dir = "/var/cream_sandbox";

open(CONF, "/etc/glite-ce-cream/cream-config.xml");
while (<CONF>) {
    if (/"CREAM_SANDBOX_DIR"\s+value\s*=\s*"([^"\s]+)/) {
	$cream_sandbox_dir = $1;
	last;
    }
}
close(CONF);

sub t($)
{
    my $t = shift;
    my ($sec, $min, $hour, $day, $mon, $year);
    ($sec, $min, $hour, $day, $mon, $year) = localtime($t);
    $year += 1900;
    $mon++;
    return sprintf "%d-%02d-%02d %02d:%02d:%02d",
	$year, $mon, $day, $hour, $min, $sec;
}

sub cleanup($)
{
    my $user = shift;

    return unless $user =~ /\d+$/;	# just to be sure

    my @pw = getpwnam($user);

    if (!@pw) {
	warn "$0: cannot getpwnam('$user')\n";
	return;
    }

    my ($uid, $gid, $home) = ($pw[2], $pw[3], $pw[7]);

    #
    # kill stale processes
    #

    my $prev = 0;

    #
    # first try to prevent the creation of new processes
    #

    unless ($dryrun) {
	for (my $i = 0; $i < 10; $i++) {
	    open (PS, "ps -u $user |");
	    my @pids = grep { s/^\s*(\d+).*/$1/ } <PS>;
	    my $n = kill 'STOP', @pids;
	    close(PS);
	    last if $n == $prev;
	    $prev = $n;
	}
    }

    #
    # now really kill them
    #

    for (my $i = 0; $i < 10; $i++) {
	my $n = 0;
	open (PS, "ps -u $user -f -www |");
	my @procs = <PS>;
	my @pids = map {
	    (my $x = $_) =~ s/^\S+\s+(\d+).*/$1/;
	    chomp($x);
	    $x;
	} @procs[1 .. $#procs];

	unless ($dryrun) {
	    $n = kill 'KILL', @pids;
	}
	close(PS);

	if (($n > 0 || $dryrun) && $verbose) {
	    print "\t" . ('-' x 70) . "\n" if $i == 0;
	    for (@procs) {
		print "\t|$_";
	    }
	    print "\t" . ('-' x 70) . "\n";
	}

	last if $n <= 0;
    }

    #
    # remove stale files and directories;
    # we cannot set the effective UID to that of the user,
    # because under the home directory there may be directories
    # created through "glexec" and therefore owned by other grid accounts
    #

    my $group = getgrgid($gid) || $gid;

    for my $location (
	    $home,
	    "$globus/tmp",
	    "$globus/tmp/gram_job_state",
	    "$cream_sandbox_dir/$group"
	) {
	opendir(DIR, $location);
	for my $entry (readdir(DIR)) {
	    my $path = "$location/$entry";
	    my @s = lstat($path);
	    next unless ($s[4] == $uid or $location eq $home);

	    next if $s[4] < 100;	# just to be sure

	    next if $entry =~ m{^\.\.?$};

	    unless ($all) {
		next if $entry =~ m{^\.bash(_logout|_profile|rc)$};
		next if $entry =~ m{^\.ssh$};
	    }

	    print "\t[$user] delete '$path'\n" if $verbose;

	    next if $dryrun;

	    if (-d $path) {
		#
		# failsafe against weird characters...
		# do not use "-f" option, else errors go unreported
		#

		$ENV{VICTIM} = $path;
		system('rm -r "$VICTIM" < /dev/null');
	    } else {
		unlink($path) || warn "$0: cannot unlink '$path': $!\n";
	    }
	}
	closedir(DIR);
    }
}

chdir("$gmd") or die "$0: cannot chdir '$gmd': $!\n";

print scalar localtime() . "\n" if $verbose;

opendir(DIR, ".");
my @files = grep(!/^\.\.?$/, readdir(DIR));
closedir(DIR);

my (%cert, %inode, %mtime, %total, %inuse);

for my $fname (@files) {
    my @s = stat($fname);
    my $nlink = @s[3];
    my $vo;

    if ($fname !~ /%/) {
	next unless ($vo = $fname) =~ s/\d+$//;
	$total{$vo}++;
    }

    next unless $nlink > 1;
    my $inode = @s[1];
    my $mtime = @s[9];

    if ($fname =~ /%/) {
	$cert{$inode} = $fname;
    } else {
	$inode{$fname} = $inode;
	$mtime{$fname} = $mtime;
	push(@{$inuse{$vo}}, $fname);
    }
}

for my $vo (keys %inuse) {
    my $count = scalar(@{$inuse{$vo}});
    my $ratio = $count / $total{$vo};

    printf("VO $vo: inuse / total = $count / $total{$vo} = %.2f, thr = $thr\n",
	$ratio) if $verbose;

    next if $ratio <= $thr;

    my @eldest = sort { $mtime{$a} <=> $mtime{$b} } @{$inuse{$vo}};

    if ($verbose) {
	for (@eldest) {
	    print "\t" . &t($mtime{$_}) . " $_\n";
	}
    }

    for my $fname (@eldest) {
	#
	# do not delete pool accounts that are too young...
	#

	last if $mtime{$fname} > time() - $opt_e * 3600;

	if ($preserve) {
	    #
	    # do not delete pool accounts that have processes
	    #

	    open (PS, "ps -u $fname |");
	    my @pids = grep { s/^\s*(\d+).*/$1/ } <PS>;
	    close(PS);

	    if ($#pids >= 0) {
		print "\t==> $fname: processes found\n" if $verbose;
		next;
	    }
	}

	my $inode = $inode{$fname};
	my $cert = $cert{$inode};
	my $tmp = "$cert.tmp";

	unless ($dryrun) {
	    my $old_mtime = $mtime{$fname};
	    my @s = stat($fname);

	    if (@s[9] != $old_mtime) {
		#
		# the pool account just got reactivated --> skip
		#
		print "\t==> $fname: keeping for '$cert'\n" if $verbose;
		next;
	    }

	    unless (rename($cert, $tmp)) {
		warn "$0: cannot rename '$cert' to '$tmp': $!\n";
		next;
	    }

	    @s = stat($fname);

	    if (@s[9] != $old_mtime) {
		#
		# uh-oh: the pool account just got reactivated --> undo!
		#
		unless (rename($tmp, $cert)) {
		    warn "$0: cannot rename '$tmp' to '$cert': $!\n";
		}
		print "\t==> $fname: rescued for '$cert'\n" if $verbose;
		next;
	    }
	}

	#
	# first clean up resources tied up by the pool account
	#

	&cleanup($fname);

	unless ($dryrun) {
	    #
	    # finally free the pool account
	    #

	    unlink($tmp) || warn "$0: cannot unlink '$tmp': $!\n";
	}

	print "\t--> $fname: unlink '$cert'\n" if $verbose;

	last if --$count / $total{$vo} <= $thr;
    }
}
