#!/usr/bin/perl -w
#
# process-control -- Process Usenet control messages to maintain a database.
#
# Written by Russ Allbery <eagle@eyrie.org>
# Based *very* heavily on controlchan by Marco d'Itri.
# Portions based on pgpverify by David Lawrence.
#
# Due to its history, this file is derived from several other files and has
# multiple (thankfully consistent) copyrights and license statements.
#
#
# Copyright 2003, 2007-2008, 2010-2011, 2017-2019
#    Russ Allbery <eagle@eyrie.org>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
#
# Copyright 2001 Marco d'Itri <md@linux.it>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
#
# Copyright (c) 1996 UUNET Technologies, Inc.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. All advertising materials mentioning features or use of this software
#    must display the following acknowledgement:
#
#    This product includes software developed by UUNET Technologies, Inc.
#
# 4. The name of UUNET Technologies ("UUNET") may not be used to endorse
#    or promote products derived from this software without specific prior
#    written permission.
#
# THIS SOFTWARE IS PROVIDED BY UUNET "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
# EVENT SHALL UUNET BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# SPDX-License-Identifier: MIT AND BSD-2-Clause AND BSD-4-Clause

##############################################################################
# Site configuration
##############################################################################

# Path to the active newsgroup database.
our $ACTIVE = '/srv/control/active.db';

# Path to the root of the control message archive.
our $ARCHIVE = '/srv/control/archive';

# Path to the control.ctl file to use.
our $CONTROL = '/srv/control/control.ctl';

# The full path to the gpgv binary.
our $GPGV = '/usr/bin/gpg1';

# The full path to the GnuPG keyring.
our $KEYRING = '/srv/control/keyring/pubring.gpg';

# The log file into which to record actions taken.  The current year and
# month will be appended.
our $LOGBASE = '/srv/control/logs/log';
our $LOG;

# The temporary directory to use for PGP verification.
our $TMPDIR = '/srv/control/tmp';

##############################################################################
# Modules and declarations
##############################################################################

require 5.006;

use strict;
use subs qw(log);

use Compress::Zlib qw(gzopen);
use DB_File ();
use Fcntl qw(LOCK_EX);
use IO::Handle ();
use POSIX qw(strftime);

##############################################################################
# Logging
##############################################################################

# Open the log file.
sub openlog {
    $LOG = $LOGBASE . '.' . strftime ('%Y-%m', gmtime);
    open (LOG, ">> $LOG") or die "$0: can't open $LOG: $!\n";
}

# Log a message to the log file.
sub log {
    my $log = $LOGBASE . '.' . strftime ('%Y-%m', gmtime);
    if ($log ne $LOG) {
        close LOG;
	openlog;
    }
    my $date = strftime ('%Y-%m-%d %T', gmtime);
    print LOG "$date [$$] ", @_, "\n";
    LOG->flush;
}

# Log a fatal message and then exit.
sub logdie {
    log ('FATAL: ', @_);
    exit 1;
}

##############################################################################
# control.ctl parsing and analysis
##############################################################################

# Read in control.ctl, converting the group patterns to regular expressions
# (still separated by colons), and returning a reference to an array of
# control lines.  Reverse the array so that subsequent code can use the first
# matching pattern.
sub read_ctl {
    my @rules;
    open (CONTROL, $CONTROL) or logdie "cannot open $CONTROL: $!";
    local $_;
    while (<CONTROL>) {
        s/^\s+//;
        s/\s+$//;
        next if /^\#/;
        next if /^$/;
        my ($type, $sender, $groups, $action) = split /:/;
        next unless $action;
        next unless $type =~ /^(?:(?:new|rm)group|checkgroups)$/;
        $action = 'drop' if $action eq 'mail';

        # Convert patterns to regexes.
        for ($sender, $groups) {
            s/([\$+.])/\\$1/g;
            s/\*/.*/g;
            s/\?/./g;
            $_ = '^' . $_ . "\$";
            s/\|/\$|^/g;
        }
        push (@rules, join (':', $type, $sender, $groups, $action));
    }
    close CONTROL;
    return [ reverse @rules ];
}

# Given a reference to an array of rules, the message ID, a sender, a control
# message type, and a target, return a list of the action (one of doit,
# verify-*, or mail) and the matching pattern.  The second is used for
# checkgroups.  The target for a checkgroups doesn't come from the control
# message and instead is the first group in the body of the checkgroups.
sub check_ctl {
    my ($rules, $id, $sender, $type, $target) = @_;
    my $pattern;
    my $action = 'drop';
    for (@$rules) {
        my @rule = split /:/;
        if ($type eq $rule[0] && $sender =~ /$rule[1]/) {
            next unless (defined ($target) && $target =~ /$rule[2]/);
            $action = $rule[3];
            $pattern = $rule[2];
            last;
        }
    }
    return ($action, $pattern);
}

##############################################################################
# Article parsing
##############################################################################

# Check a wide variety of things about the headers.  Returns the log message
# to use to reject the message if it's bad, or the empty string otherwise.
# (This is an odd return value, but it makes the code a lot shorter.)  Takes
# the message ID and a ref to the header hash.
sub check_headers {
    my ($id, $hdr) = @_;

    for (qw/approved control from message-id subject/) {
        return "$id missing header $_" unless exists $$hdr{$_};
    }
    return "$id message ID mismatch with $$hdr{'message-id'}"
        if ($id && $$hdr{'message-id'} ne $id);
    $id = $$hdr{'message-id'};

    # Check the control header.
    my @control = split (' ', $$hdr{control});
    return "$id unknown control type $control[0]"
        if ($control[0] !~ /^(?:(?:new|rm)group|checkgroups)$/);
    return "$id invalid control header"
        if (!@control);
    return "$id invalid control header"
        if ($control[0] eq 'rmgroup' && @control > 2);
    return "$id missing newsgroup for $control[0]"
        if ($control[0] =~ /^(?:new|rm)group$/ && !$control[1]);
    if ($control[0] eq 'checkgroups') {
        my @info = @control[1 .. $#control];
        for (@info) {
            return "$id bad syntax for checkgroups: $_"
                unless /^(\#\d+|!?[a-zA-Z0-9+_-]+(\.[a-zA-Z0-9+_-]+)*)\z/;
        }
    } elsif ($control[0] eq 'newgroup') {
        return "$id bad newgroup mode $control[2]"
            if ($control[2] && $control[2] !~ /^[ym]/);
        return "$id bad newsgroup name $control[1] (invalid characters)"
            unless ($control[1] =~ /^[a-z0-9+_.-]+$/);
        return "$id bad newsgroup name $control[1] (empty component)"
            if ($control[1] =~ /^\.|\.\.|\.$/);
        return "$id bad newsgroup name $control[1] (bad initial character)"
            unless ($control[1] =~ /^[a-z]/);
        return "$id bad newsgroup name $control[1] (too long)"
            if (length ($control[1]) > 80);
        return "$id bad newsgroup name $control[1] (one component)"
            unless ($control[1] =~ /\./);
        return "$id bad newsgroup name $control[1] (reserved hierarchy)"
            if ($control[1] =~ /^(?:control|to|example)\./);
        my @components = split (/\./, $control[1]);
        for (@components) {
            return "$id bad component in $control[1] (all numeric)"
                if /^\d+$/;
            return "$id bad component in $control[1] (all or ctl)"
                if /^(?:all|ctl)$/;
            return "$id bad component in $control[1] (bad initial character)"
                unless /^[a-z0-9]/;
        }
    }
    return '';
}

# Parse an article into a hash of headers and an array of body lines.  For
# certain key headers, also ensure that the headers only occur once in the
# message.  Takes the file handle to the article, its message ID, an array ref
# into which to put the headers, an array ref into which to put the body, a
# hash ref into which to put parsed headers, and a hash ref into which to put
# duplicated headers.
sub parse_article {
    my ($fh, $id, $headers, $body, $hdr, $dups) = @_;
    my %unique = map { $_ => 1 }
        qw(approved control date from message-id subject sender x-pgp-sig);

    # Parse the headers first, checking for duplicates as we go.
    my $last;
    local $_;
    while (<$fh>) {
        s/\r\n\z/\n/;
        last if ($_ eq "\n");
        push (@$headers, $_);
        if (/^(\S+):\s+(.+)/) {
            $last = lc $1;
            if (exists $$hdr{$last}) {
                if ($unique{$last}) {
                    log "$id multiple $1 headers";
                    return 0;
                }
                $$hdr{$last} .= ' ' . $2;
                $$dups{$last}++;
            } else {
                $$hdr{$last} = $2;
            }
        } elsif (defined ($last) && /^(\s.+)/) {
            $$hdr{$last} .= $1;
        } else {
            log "$id broken headers";
            return 0;
        }
    }

    # Check the validity of the errors.
    unless (@$headers) {
        log "$id appears to be empty";
        return 0;
    }
    my $error = check_headers ($id, $hdr);
    if ($error) {
        log $error;
        return 0;
    }

    # Slurp up the article body.
    while (<$fh>) {
        s/\r\n\z/\n/;
        last if ($_ eq ".\n");
        s/^\.\././;
        push (@$body, $_);
    }
    return 1;
}

# Produce a cleaned up version of the e-mail address of the sender, for
# checking against control.ctl.
sub clean_from {
    local $_ = shift;
    s/(\s+)?\(.*\)(\s+)?//g;
    s/^.*<(.*)>.*/$1/;

    # Protect against weird characters.
    s/[^-a-zA-Z0-9+_.@%]/_/g;
    s/^-/_/;
    return $_;
}

# Find the first valid group in a checkgroups message.  Takes a reference to
# the body of the message.
sub checkgroups_group {
    my $body = shift;
  LINE:
    for (@$body) {
        my ($group, $desc) = split (' ', $_, 2);
        $desc =~ s/\s+$// if defined $desc;
        next unless $desc;
        next if $desc =~ /[\x00-\x1f]/;
        next unless ($group =~ /^[a-z0-9+_.-]+$/);
        next unless $group =~ /\./;
        next if ($group =~ /^\.|\.\.|\.$/);
        next if length ($group) > 80;
        my @components = split (/\./, $group);
        for (@components) {
            next LINE if /^\d+$/;
            next LINE if /^(?:all|ctl)$/;
            next LINE if length ($_) > 30;
        }
        return $group;
    }
}

##############################################################################
# PGP verification
##############################################################################

# Parse the X-PGP-Sig header, returning the version, the signature, and a list
# of signed headers.  Returns an empty list if the header doesn't parse
# correctly.
sub parse_pgp_sig {
    local $_ = shift;

    # This is a complex regex, so break it up into pieces.  An X-PGP-Sig
    # header consists of a version, a separator, a comma-separated list of
    # headers included in the signature, a separator, and then some number of
    # separated base64 lines.  The last line starts with =, and the second to
    # the last line may end with one or more = characters.  Each base64 line
    # except for the last should contain exactly 64 characters and the last
    # should contain 4 characters after the =.
    #
    # $s is a separator, which is any number of spaces and tabs which can
    # additionally be folded.
    my $s = '[ \t]*(?:\n?[ \t]+)+';

    # $b64 matches a single base64 character.
    my $b64 = '[a-zA-Z0-9+/]';

    # Now actually parse the header.  Require that the headers argument
    # contain at least one comma so that we can more easily diagnose a missing
    # version header.
    return unless /^(\S+)$s(\S+,\S+)(($s$b64{64})+$s$b64+=?=?$s=$b64{4})$/;

    # Return the appropriate bits of the header.
    my ($version, $headers, $signature) = ($1, $2, $3);
    $signature =~ s/$s/\n/g;
    return ($version, $signature, split (/,/, $headers));
}

# Generate the PGP message that should be verified.  Takes the version, a
# reference to a list of headers that should be included in the signed
# version, the signature, a ref to the hash of article headers, and a ref to
# the array of body lines.  Returns the PGP message and the signature as a
# list.
sub generate_pgp {
    my ($version, $headers, $signature, $hdr, $body) = @_;
    my $message = "X-Signed-Headers: " . join (',', @$headers) . "\n";
    for my $header (@$headers) {
        my $label = lc $header;
        $message .= "$header: ";
        $message .= "$$hdr{$label}" if $$hdr{$label};
        $message .= "\n";
    }
    $message .= "\n";
    $message .= join ('', @$body);
    $message =~ s/[ \t]+\n/\n/g;
    my $sig = "\n-----BEGIN PGP SIGNATURE-----\n";
    $sig .= "Version: $version\n";
    $sig .= $signature;
    $sig .= "\n-----END PGP SIGNATURE-----\n";
    return ($message, $sig);
}

# Write the message to a file for checking.
sub write_message {
    my ($message, $file) = @_;
    open (TMP, "> $file") or logdie "cannot create $file: $!";
    print TMP $message;
    close TMP or logdie "cannot flush $file: $!";
    logdie "write error to $file" if (-s $file != length $message);
}

# Run GnuPG on a saved temporary file and return the key ID that signed the
# message.  Takes the message ID for error reporting and file as an argument
# and gets the keyring and path to GnuPG from global variables.
sub gpg_check {
    my ($id, $file, $sigfile) = @_;

    # Run GnuPG and make sure it exits successfully.
    my $opts = "--verify --quiet --status-fd=1 --logger-fd=1";
    $opts .= " --keyring=$KEYRING --allow-weak-digest-algos";
    open (GPG, "$GPGV $opts $sigfile $file 2>/dev/null |")
        or logdie "cannot execute $GPGV: $!";
    local $/;
    $_ = <GPG>;
    unlink $file or log "ERROR: cannot unlink $file: $!";
    unlink $sigfile or log "ERROR: cannot unlink $sigfile: $!";
    close GPG;

    # Check the results.
    my $signer;
    if ($? != 0) {
        if (/\n/) {
            my $firstline = $_;
            $firstline =~ s/\n.*//s;
            log "$id PGP: $firstline";
        }
        if ($? >> 8) {
            log "$id PGP: gpgv exited with status " . ($? >> 8);
        } else {
            log "$id PGP: gpgv died on signal " . ($? & 255);
        }
    } else {
        if (/\[GNUPG:\]\s+GOODSIG\s+\S+\s+(.+)/) {
            $signer = $1;
        } else {
            my $firstline = $_;
            $firstline =~ s/\n.*//s;
            log "$id PGP: $firstline";
        }
    }
    return $signer;
}

# Given the message ID, a hash of the article headers, the hash of which
# headers are duplicated, and the array containing the body, verify the PGP
# signature and return the signer.  If the message isn't properly signed, log
# the error and return undef.
sub pgpverify {
    my ($id, $hdr, $dups, $body) = @_;

    # Make sure the article is a valid signed message.
    unless ($$hdr{'x-pgp-sig'}) {
        log "$id missing X-PGP-Sig header";
        return;
    }
    my ($version, $signature, @headers) = parse_pgp_sig $$hdr{'x-pgp-sig'};
    unless ($signature) {
        log "$id invalid X-PGP-Sig header (missing version?)";
        return;
    }
    for (@headers) {
        if ($$dups{lc $_}) {
            log "$id signed header $_ is duplicated";
            return;
        }
    }
    my %headers = map { lc ($_) => 1 } @headers;
    for (qw/control date/) {
        unless ($headers{$_}) {
            log "$id $_ header not signed";
            return;
        }
    }

    # Check the signature.
    my ($message, $sigfile)
        = generate_pgp ($version, \@headers, $signature, $hdr, $body);
    write_message ($message, "$TMPDIR/$$");
    write_message ($sigfile, "$TMPDIR/$$.asc");
    my $signer = gpg_check ($id, "$TMPDIR/$$", "$TMPDIR/$$.asc");
    if ($signer && $signer =~ /\s/ && $signer =~ /</) {
        $signer =~ s/^.*?<([^>]+)>.*/$1/;
    }
    return $signer;
}

##############################################################################
# Control message actions
##############################################################################

# Process a newgroup.  Takes the message ID, the name of the group, the mode,
# and the body of the message (to find the description) and updates the
# database appropriately.  Logs the results of the action.
sub newgroup {
    my ($id, $group, $mode, $body) = @_;
    $mode = $mode ? substr ($mode, 0, 1) : 'y';

    # Locate and parse the description out of the body.  newgroup control
    # messages without descriptions will not be acted on.
    my ($next, $description);
    for (@$body) {
        if ($next && /^\Q$group\E\s+(\S.+)/) {
            $description = $1;
            last;
        } elsif ($_ =~ /^For your newsgroups file:\s*$/) {
            $next = 1;
        } else {
            $next = 0;
        }
    }
    if (defined ($description) && $description =~ /[\x00-\x1f]/) {
        log "$id control characters in newsgroup description";
        return;
    }
    unless ($description) {
        log "$id no description in body of newgroup for $group";
        return;
    }
    $description =~ s/[\x00-\x1f]//g;
    if ($mode eq 'm' && $description !~ /\(Moderated\)$/) {
        $description =~ s/\s*\(Moderated\)//;
        $description =~ s/\s*$/ (Moderated)/;
    } elsif ($mode ne 'm' && $description =~ /\(Moderated\)$/) {
        $description =~ s/\s*\(Moderated\)$//;
    }

    # Check any extra restrictions we put on the newsgroup name for control
    # messages we still want to archive.
    my @components = split (/\./, $group);
    for (@components) {
        if (length ($_) > 30) {
            log "$id bad component in $group (too long)";
            return;
        }
    }

    # Now, actually update the database.
    my %db;
    open (LOCK, "+> $ACTIVE.lock") or logdie "cannot open $ACTIVE.lock: $!";
    flock (LOCK, LOCK_EX) or logdie "cannot lock $ACTIVE.lock: $!";
    tie (%db, 'DB_File', $ACTIVE) or logdie "cannot tie $ACTIVE: $!";
    if (!$db{$group}) {
        log "ACTION: newgroup $group $mode";
    } else {
        my @old = split (' ', $db{$group}, 2);
        if ($old[0] ne $mode) {
            log "ACTION: changegroup $group from $old[0] to $mode";
        } elsif ($old[1] ne $description) {
            log "ACTION: changedesc $group";
        }
    }
    $db{$group} = "$mode $description";
    untie %db;
    close LOCK;
    log "$id processed newgroup for $group";
}

# Process an rmgroup.  Takes the message ID, the name of the group, and logs
# the results of the action.
sub rmgroup {
    my ($id, $group) = @_;
    my %db;
    open (LOCK, "+> $ACTIVE.lock") or logdie "cannot open $ACTIVE.lock: $!";
    flock (LOCK, LOCK_EX) or logdie "cannot lock $ACTIVE.lock: $!";
    tie (%db, 'DB_File', $ACTIVE) or logdie "cannot tie $ACTIVE: $!";
    if ($db{$group}) {
        log "ACTION: rmgroup $group";
        delete $db{$group};
    }
    untie %db;
    close LOCK;
    log "$id processed rmgroup for $group";
}

# Process a checkgroups.  Takes the group pattern affected, the body of the
# message, and the checkgroup arguments.  Logs all actions taken.
sub checkgroups {
    my ($id, $pattern, $body, @args) = @_;
    @args = grep { !/^\#/ } @args;
    my $include = join ('|', grep { /^[^!]/ } @args);
    $include = qr/^(?:$include)(?:\.|\z)/ if $include;
    my $exclude = join ('|', map { s/^\!//; $_ } grep { /^!/ } @args);
    $exclude = qr/^(?:$exclude)(?:\.|\z)/ if $exclude;
    my %checkgroups;

    # First, process the body of the checkgroups and build a hash of valid
    # groups in the hierarchy in the same format as our database.
  LINE:
    for (@$body) {
        next unless /$pattern/;
        my ($group, $desc) = split (' ', $_, 2);
        $desc =~ s/\s+$//;
        next unless $desc;
        next if $desc =~ /[\x00-\x1f]/;
        next unless ($group =~ /^[a-z0-9+_.-]+$/);
        next if ($group =~ /^\.|\.\.|\.$/);
        next if length ($group) > 80;
        my @components = split (/\./, $group);
        for (@components) {
            next LINE if /^\d+$/;
            next LINE if /^(?:all|ctl)$/;
            next LINE if length ($_) > 30;
        }
        if ($include) {
            next unless ($group =~ /$include/);
        }
        if ($exclude) {
            next if ($group =~ /$exclude/);
        }
        my $mode = ($desc =~ /\(Moderated\)$/) ? 'm' : 'y';
        $checkgroups{$group} = "$mode $desc";
    }

    # Now, open the database and take a first pass through all groups for that
    # same hierarchy that are in the database, match @include, and don't match
    # @exclude, fixing modes and descriptions and removing any groups that
    # aren't in the new checkgroups.
    my %db;
    open (LOCK, "+> $ACTIVE.lock") or logdie "cannot open $ACTIVE.lock: $!";
    flock (LOCK, LOCK_EX) or logdie "cannot lock $ACTIVE.lock: $!";
    tie (%db, 'DB_File', $ACTIVE) or logdie "cannot tie $ACTIVE: $!";
    my ($group, $old);
    while (($group, $old) = each %db) {
        next unless $group =~ /$pattern/;
        if ($include) {
            next unless $group =~ /$include/;
        }
        if ($exclude) {
            next if $group =~ /$exclude/;
        }
        if (!$checkgroups{$group}) {
            log "ACTION: rmgroup $group";
            delete $db{$group};
        } elsif ($db{$group} ne $checkgroups{$group}) {
            my @old = split (' ', $db{$group}, 2);
            my ($mode, $description) = split (' ', $checkgroups{$group}, 2);
            if ($old[0] ne $mode) {
                log "ACTION: changegroup $group from $old[0] to $mode";
            } elsif ($old[1] ne $description) {
                log "ACTION: changedesc $group";
            }
            $db{$group} = $checkgroups{$group};
        }
    }

    # Now, take a final pass through the checkgroups to add any groups that we
    # were missing.
    for (keys %checkgroups) {
        unless ($db{$_}) {
            my ($mode, $description) = split (' ', $checkgroups{$_}, 2);
            log "ACTION: newgroup $_ $mode";
            $db{$_} = $checkgroups{$_};
        }
    }
    untie %db;
    close LOCK;
    log "$id processed checkgroups for $pattern";
}

##############################################################################
# Archiving
##############################################################################

# Archive a particular message in a file.  Takes the file name, the headers
# array, and the body array.
sub archive_message {
    my ($file, $headers, $body) = @_;
    my $umask = umask 002;
    my $message = join ('', @$headers) . "\n" . join ('', @$body);
    $message = "From usenet " . scalar (gmtime) . "\n" . $message;
    open (LOCK, "+> $ARCHIVE/.lock")
        or logdie "cannot open $ARCHIVE/.lock: $!";
    flock (LOCK, LOCK_EX) or logdie "cannot lock $ARCHIVE/.lock: $!";
    if (-f "$file.gz") {
        $message = "\n" . $message;
        my $old = gzopen ("$file.gz", 'r')
            or logdie "cannot open $file.gz: $!";
        my $new = gzopen ("$file.new.gz", 'w')
            or logdie "cannot create $file.new.gz: $!";
        my ($buffer, $bytes);
        while (($bytes = $old->gzread ($buffer, 64 * 1024)) > 0) {
            if ($new->gzwrite ($buffer) != $bytes) {
                logdie "cannot write to $file.new.gz: $!";
            }
        }
        $old->gzclose;
        if ($new->gzwrite ($message) != length $message) {
            logdie "cannot write to $file.new.gz: $!";
        }
        $new->gzclose;
    } else {
        my $new = gzopen ("$file.new.gz", 'w')
            or logdie "cannot create $file.gz: $!";
        if ($new->gzwrite ($message) != length $message) {
            logdie "cannot write to $file.new.gz: $!";
        }
        $new->gzclose;
    }
    rename ("$file.new.gz", "$file.gz")
        or logdie "cannot rename $file.new.gz: $!";
    close LOCK;
    umask $umask;
}

# Archive a control message.  Write the control message as a compressed file,
# and deal with the possibility that an archive for that control message
# already exists.  Uses a lock file at the top of the archive directory to
# prevent any contention.  Takes the message ID, the name of the affected
# group and the raw header and body arrays.
sub archive {
    my ($id, $group, $headers, $body) = @_;
    my $dir = $group;
    $dir =~ s/\..*//;
    logdie "bad newsgroup name $group" if (!$dir || $dir =~ m%^\.|^-|/%);
    $dir = "$ARCHIVE/$dir";
    unless (-d $dir) {
        my $umask = umask 002;
        mkdir $dir or logdie "cannot create $ARCHIVE/$dir: $!";
        umask $umask;
    }
    archive_message ("$dir/$group", $headers, $body);
    log "$id archived as $group";
}

##############################################################################
# Main routine
##############################################################################

# Trim extraneous garbage from the path.
my $fullpath = $0;
$0 =~ s%.*/%%;

# Get permissions right.
umask 007;

# Open the log file.
openlog;

# Read in control.ctl.  We only do this once.
my $rules = read_ctl;

# Processing loop.  We listen on stdin and get file names and message IDs, one
# set per line, separated by spaces.
my ($file, $id);
while (<STDIN>) {
    ($file, $id) = split;

    # Don't process any files over 256KB for a variety of reasons.  (Big Eight
    # checkgroups messages are about 180KB.)
    my $size = (-s $file);
    if (!$size || $size > 256 * 1024) {
        $size = 0 unless $size;
        log "$id has invalid size ($size)";
        next;
    }

    # Parse the article.
    open (ART, $file) or logdie "cannot open article $file: $!";
    my (@headers, @body, %hdr, %dups);
    next unless parse_article (\*ART, $id, \@headers, \@body, \%hdr, \%dups);
    $id = $hdr{'message-id'} unless $id;
    my $sender = clean_from ($hdr{sender} || $hdr{from});
    my @control = split (' ', $hdr{control});

    # Discard newgroup and rmgroup messages over 64KB, since they'll also be
    # archived, and I don't really want to archive tons of crap.
    if ($control[0] =~ /^(?:new|rm)group$/ && $size > 64 * 1024) {
        log "$id $control[0] too large ($size)\n";
        next;
    }

    # Check to see what we want to do with it.  If this is a checkgroups,
    # figure out what hierarchy it's affecting.  This should really be
    # based on the checkgroups arguments if present, but isn't yet.
    my ($action, $pattern);
    if ($control[0] eq 'checkgroups') {
        my $group = checkgroups_group (\@body);
        ($action, $pattern)
            = check_ctl ($rules, $id, $sender, 'checkgroups', $group);
    } else {
        ($action, $pattern) = check_ctl ($rules, $id, $sender, @control);
    }
    if ($action =~ /^verify-(.+)/) {
        my $wanted = $1;
        my $signer = pgpverify ($id, \%hdr, \%dups, \@body);
        if (defined ($signer) && $signer eq $wanted) {
            $action = 'doit';
        } else {
            $action = 'drop';
        }
    }

    # Act on the results.
    if ($action eq 'doit') {
        if ($control[0] eq 'newgroup') {
            newgroup ($id, $control[1], $control[2], \@body);
        } elsif ($control[0] eq 'rmgroup') {
            rmgroup ($id, $control[1]);
        } elsif ($control[0] eq 'checkgroups') {
            checkgroups ($id, $pattern, \@body, @control[1 .. $#control]);
        } else {
            log "$id unknown control type $control[0]";
        }
    } elsif ($action ne 'drop') {
        log "$id internal: unknown action $action";
    }

    # We archive anything that parsed correctly as a newgroup or rmgroup, even
    # if it failed PGP verification.  This should perhaps be rethought later.
    if ($control[0] =~ /^(new|rm)group$/) {
        archive ($id, $control[1], \@headers, \@body);
    } elsif ($control[0] eq 'checkgroups') {
        unless (-d "$ARCHIVE/other.ctl") {
            mkdir "$ARCHIVE/other.ctl"
                or logdie "cannot create $ARCHIVE/other.ctl: $!";
        }
        my $year = (gmtime)[5] + 1900;
        archive_message ("$ARCHIVE/other.ctl/checkgroups.$year", \@headers,
                         \@body);
        log "$id archived checkgroups";
    }
} continue {
    unlink $file or logdie "cannot unlink article $file: $!";
}

##############################################################################
# Documentation
##############################################################################

=head1 NAME

process-control - Process Usenet control messages to maintain a database

=head1 SYNOPSIS

process-control

=head1 DESCRIPTION

B<process-control> processes Usenet control messages, applying the rules
from a F<control.ctl> file in the same format used by INN, and maintains a
database of active groups with descriptions.  It also archives control
messages in gzip format in a central archive and logs all of its actions
to a log file.

It expects to receive on standard input file names and message IDs, one
pair of file name and message ID per line.  The message ID is optional,
but is used for logging until the message has been parsed and therefore
omitting it may result in log messages that have no associated ID.  The
file is deleted when it has been successfully processed.

Messages are considered invalid and are discarded (with a log message) if:

=over 2

=item *

The message is larger than 256KB, or the message is not a checkgroups
control message and is larger than 64KB.

=item *

The message is syntactically invalid.  This means that it's empty or it
contains a line in the headers that doesn't fit the syntax of the headers
(a sequence of printable, non-space ASCII characters followed by a colon
and at least one space, or a line beginning with whitespace indicating a
continuation of the previous header).

=item *

Any of the headers Approved, Control, From, Message-ID, or Subject are
missing, or any of the headers Approved, Control, Date, From, Message-ID,
Subject, Sender, or X-PGP-Sig are duplicated.

=item *

The message is not a newgroup, rmgroup, or checkgroups control message, or
the Control header doesn't follow the correct syntax for that type of
control message.  An rmgroup control header must have exactly one
argument, the name of the newsgroup to remove.  A newgroup control message
may have two or three; the second must be the name of the newsgroup to
create and the third, if present, must indicate the group mode.

=item *

The message is a newgroup control message and the newsgroup mode does not
begin with either C<y> (unmoderated) or C<m> (moderated).

=item *

The message is a newgroup control message and the group name to be created
is longer than 80 octets, contains a character other than those from the
set [a-z0-9+_.-], begins or ends with a period, does not contain a period,
contains two consecutive periods, begins with a character other than a
lowercase letter, starts with C<control.>, C<example.>, or C<to.>,
contains a name component (delimited by periods) that is entirely numeric,
contains a name component (delimited by periods) equal to C<all> or C<ctl>
(these have special meaning for older servers), or contains a name
component that begins with a character other than a digit or letter.

=back

All other messages will be archived.  newgroup and rmgroup control
messages will be archived in a file named I<hierarchy>/I<group>.gz, where
I<group> is the affected group and I<hierarchy> is its first component
(the part up to the first period in the name, if any).  checkgroups
control messages will be archived in other.ctl/checkgroups.I<year> where
I<year> is the current year.

Messages will also be checked against the rules from a F<control.ctl> file
and messages that, according to the rules in that file, should be acted on
will result in updates to a database of active groups.  That database is a
Berkeley DB hash file with keys equal to the group names and values of the
group mode, a single space, and then the group description.  It is locked
against concurrent updates.

Messages that F<control.ctl> indicates need to be checked using PGP will
be passed through GnuPG using the same algorithm as pgpverify and will
only be processed if the signer matches the required signer in
F<control.ctl>.

newgroup messages must have a C<For your newsgroups file:> line in the
body, followed by a line suitable for a checkgroups message (the group
name, some whitespace, and the description, containing no control
characters).  C<(Moderated)> will be added to the description if not
present for a moderated group, and will be removed if present for an
unmoderated group.  For a newgroup message to be acted on, it must also
not contain any component longer than 30 characters.

The same constraints on group names mentioned above are also applied to
the group names in the body of a checkgroups message, and any groups not
fitting those requirements will be ignored, as will any group that does
not have a description.  In a checkgroups message, the moderation status
of the group will be determined by the presence or absence of
C<(Moderated)> at the end of the description.

Even if a newsgroup already exists, its description and mode will be
checked and updated by any applicable newgroup or checkgroups message.

All changes made to the database will be logged, starting with the word
C<ACTION:>.

=head1 FILES

=over 4

=item F</srv/control/active.db>

The database of active groups, updated as described above.  It is locked
against simultaneous access by using fcntl locking on a file by the same
name but with C<.lock> appended.

=item F</srv/control/archive>

The root of the control message archive.  It is locked against
simultaneous writers by using fcntl locking on a file named C<.lock> at
the top level.

=item F</srv/control/control.ctl>

The F<control.ctl> file used to determine which control messages should be
applied to the active newsgroups database.

=item F</srv/control/keyring/pubring.gpg>

The keyring used to verify PGP-signed control messages.  The user IDs on
the keys stored in this keyring must match the user IDs expected according
to the rules in F<control.ctl>.

=item F</srv/control/logs/log.%Y-%m>

Where actions are logged.  %Y is replaced by the current four-digit year
and %m by the current two digit month.

=item F</srv/control/tmp>

Used for temporary files for PGP verification.

=back

=head1 AUTHORS

Written by Russ Allbery <eagle@eyrie.org>, based very heavily on controlchan
by Marco d'Itri.  Portions based on pgpverify by David Lawrence.

=head1 SEE ALSO

export-control(1), update-control(1)

This script is part of the control-archive package.  The control-archive web
page at L<https://www.eyrie.org/~eagle/software/control-archive/> will have
the current version of the package.

=cut
