#! /usr/bin/perl
#
# amavis-stats -- generate rrds from amavis log output
#
# Copyright (C) 2003, Mark Lawrence (nomad@null.net)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License with
# the Debian GNU/Linux distribution in file /usr/share/common-licenses/GPL;
# if not, write to the Free Software Foundation, Inc., 59 Temple Place,
# Suite 330, Boston, MA  02111-1307  USA
#
# On Debian systems, the complete text of the GNU General Public
# License, version 2, can be found in /usr/share/common-licenses/GPL-2.


# ########################################################################
# Dependencies
# ########################################################################
use strict;
use warnings;
use Getopt::Std;
use Time::localtime;
use Time::Local;
use RRDs;
use Fcntl ':flock';
use POSIX qw(strftime);
use POSIX qw(locale_h);


# ########################################################################
# Globals
# ########################################################################
# Program identity
our $me;          # name this program was invoked as, directory stripped
our $version;     # this program version
our $pkg;         # name of this package (usually amavis-stats)

# Command line
our %opt;         # command line options as filled in by getopts
our $debug;       # set by the -d flag; enables dbg() output
our $verbose;     # informational messages on/off (switched off by -q)
our $locale;      # name of the locale month names are taken from
our $logfile;     # input log file on command line

# Files kept between invocations
our $statedir;    # location of rrd files
our $lockfile;    # lock file to prevent more than one invocation at a time
our $statefile;   # between invocation status file
our $namesfile;   # mappings of IDs to virus names
our $countfile;   # mappings of IDs to virus occurrences
our $seenfile;    # mappings of IDs to virus first/last seen times

# Position tracking while reading the log
our $spos;        # start position of input file this run
our $pos;         # current position in input file
our $eof;         # position of end of input file
our $line;        # string containing current line

# Time keeping
our $rrdstep;     # rrd step size
our $epoch;       # seconds since 1970
our $lastepoch;   # seconds since 1970, previous time around
our $lastupdate;  # epoch of last global rrd update
our $year;        # current year
our %months;      # locale-dependent hash of months-to-index

# Statistics tables, keyed by numeric ID unless noted otherwise
our $numv;        # number of virus seen, continually incrementing
our %rvid;        # in-memory mapping of virus names to IDs
our %vnames;      # in-memory mapping of IDs to virus names
our %occurence;   # in-memory mapping of IDs to counts/occurrences
our %firstseen;   # in-memory mapping of virus first seen times
our %lastseen;    # in-memory mapping of virus last seen times
our %spamsess;    # in-memory store of session ids (amavis process-count
                  # field) used for SPAM-TAG


# ########################################################################
# Initial values & Constants
# ########################################################################
# Defaults for the package globals. Every path is derived from $pkg, so the
# lock file lives in /var/lock and all persistent data in /var/lib/$pkg.
($me = $0) =~ s%.*/%%;    # keep only the file name part of the invocation path
$version   = "0.1.12"; # this value is auto-updated by the packaging system
$pkg       = "amavis-stats";
$locale    = "C";      # replaced by the environment's LC_TIME when -l is given
$lockfile  = "/var/lock/$pkg";
$statedir  = "/var/lib/$pkg";
$statefile = "$statedir/$pkg.state"; # last read position of the logfile
$namesfile = "$statedir/$pkg.names"; # stores the virus name to id mappings
$countfile = "$statedir/$pkg.count"; # per virus totals
$seenfile  = "$statedir/$pkg.seen";  # first and last time() seen
$rrdstep   = 300;      # passed to rrdtool as --step and used as update interval


# ########################################################################
# Subroutines
# ########################################################################

#
# Print the one-line command synopsis to STDERR
#
sub usage() {
    my $synopsis = "usage: $0 [-hVqdrl] file\n";
    print STDERR $synopsis;
}

sub help() {
    # The synopsis comes first, then the long description below it.
    usage();

    my $details = <<"EOF";

    Version: $version

    This program generates virus infection statistics from amavis/syslog
    log files. It is typically called from cron(8), but can also be used
    from the command line when populating the databases with historical
    data.

    -h        : this (help) message
    -v        : does nothing (legacy verbose option)
    -q        : quiet mode - no output
    -d        : print debugging messages to stderr
    -r        : reset file pointer to 0, instead of starting at last position
    -l        : take locale from the environment (instead of English)
    -V        : display version and exit

    examples:

    Initial import of existing data:
    amavis# $me /var/log/mail.info.2 
    amavis# $me -r /var/log/mail.info.1 
    amavis# $me -r /var/log/mail.info.0 
    amavis# $me -r /var/log/mail.info 

    Normal usage:
    amavis# $me /var/log/amavis.log 

EOF

    print STDERR $details;
}


#
# Command line options processing
#
# Parses the switches into %opt, derives $verbose and $debug from them,
# settles which log file to read ($logfile), records the current year
# ($year) and builds the month-name-to-index table (%months) that is used
# to decode syslog timestamps. Exits through do_exit() on -h, -V, a bad
# switch, a missing file argument or a file that does not exist.
#
sub init()
{
    my $opt_string = 'hvqdf:lrV';
    if (!getopts( "$opt_string", \%opt )) {
        usage();
        do_exit(1);
    }

    if ($opt{h}) {
        help();
        do_exit(1);
    }

    if ($opt{V}) {
        print "$version\n";
        do_exit(1);
    }

    $verbose = $opt{q} ? 0 : 1;
    $debug   = 1 if $opt{d};

    # Announce ourselves only now: until the switches have been parsed
    # $debug is unset, so a dbg() call any earlier can never print.
    dbg("$me version $version");

    # -f is the legacy way to name the input file; otherwise it is the
    # first remaining argument.
    $logfile = $opt{f} ? $opt{f} : $ARGV[0];
    if (!$logfile) {
        usage();
        do_exit(1);
    }
    if ( ! -f $logfile ) {
        do_exit(1, "file \"$logfile\" does not exist");
    }

    $year    = localtime->year() + 1900;

    # build default (English?) hash of Month-to-Numbers
    %months = (
        "Jan" => "0", "Feb" => "1",  "Mar" => "2",  "Apr" => "3",
        "May" => "4", "Jun" => "5",  "Jul" => "6",  "Aug" => "7",
        "Sep" => "8", "Oct" => "9", "Nov" => "10", "Dec" => "11"
    );

    # With -l, add the abbreviated month names of the current LC_TIME
    # locale on top of the English defaults.
    if ($opt{l}) {
        $locale = setlocale(LC_TIME);
        dbg("locale is set to \"$locale\"");
        for my $month_index (0..11) {
            # Day 1 of each month; only the %b (abbreviated name) matters.
            my $abbrev = strftime("%b", 0, 0, 0, 1, $month_index, 96);
            dbg("$abbrev");
            $months{$abbrev} = $month_index;
        }
    }

}


#
# Make sure that only one copy is running at a time
#
# Opens $lockfile on the LOCKF handle and takes an exclusive, non-blocking
# lock on it, retrying once after two seconds. Exits through do_exit() if
# the file cannot be opened or the lock cannot be obtained. On success the
# file holds our process id.
#
sub semlock {
    # Open for append so that merely *trying* to get the lock does not wipe
    # the process id written by the instance that currently holds it.
    open (LOCKF, '>>', $lockfile)
        or do_exit(1, "Could not open $lockfile: $!");

    unless (flock(LOCKF, LOCK_EX | LOCK_NB)) {
        err("warning: Could not lock $lockfile: $!");
        sleep 2;
        unless (flock(LOCKF, LOCK_EX | LOCK_NB)) {
            do_exit(1, "Could not lock $lockfile: $!");
        }
    }

    # The lock is ours: discard any stale content, and unbuffer the handle
    # so the process id is on disk now rather than when the handle closes.
    truncate(LOCKF, 0);
    my $previous = select(LOCKF);
    $| = 1;
    select($previous);

    print LOCKF "$$\n";
    dbg("Have lock on $lockfile");
}


#
# Undo our lock. This is only for the sake of completeness - all file
# handles are closed (and locks lost) on program exit anyway.
#
# The lock file is removed only when this process really holds the lock.
# do_exit() calls us on every exit path, including those taken before
# semlock() has run and the one taken after losing the race for the lock;
# removing the file there would delete the lock of a running instance.
#
sub semunlock {
    # LOCKF was never (successfully) opened: there is nothing of ours to undo.
    return unless defined fileno(LOCKF);

    # Asking again for a lock we already hold succeeds immediately, while it
    # fails if another process holds it - so this tells us who the owner is.
    if (flock(LOCKF, LOCK_EX | LOCK_NB)) {
        if (unlink("$lockfile")) {
            dbg("lock $lockfile removed");
        }
    }
    close LOCKF;
}


#
# Common exit path: release the lock (so that the lock file does not
# remain untidily behind), then leave with the given exit code. For a
# non-zero code an optional message is printed to STDERR first.
#
sub do_exit {
    my ($status, $text) = @_;

    semunlock();

    exit 0 if $status == 0;

    print STDERR "$me: $text\n" if defined $text;
    exit $status;
}


#
# Load the values of the previous run into variables
#
# When the state file exists, reads it together with the names, count and
# seen files and fills in $spos, $numv, $lastupdate, %spamsess, %rvid,
# %vnames, %occurence, %firstseen and %lastseen. Without a recorded
# position the run is treated as the first one and the counters start at
# zero. The -r switch forces the start position back to 0.
#
# Exits through do_exit() if $statedir is unusable; dies if one of the
# files cannot be opened.
#
sub loadState {
    dbg("loadState()");
    $spos = undef;

    #
    # Check that we have somewhere to save our status - Not much point
    # in continuing otherwise.
    #
    if ((! -d $statedir) or (! -w $statedir)) {
        do_exit(1,"$statedir does not exist or cannot be written to.");
    }

    #
    # Grab the previous position reached in the log file, plus
    # the total number of different viruses we have seen
    #
    if (-f $statefile) {

        dbg("opening file $statefile");
        open (my $state_fh, '<', $statefile)
            or die "Could not open $statefile: $!";
        while (my $entry = <$state_fh>) {
            if ($entry =~ /^pos:\s*(\d+)/) {
                $spos = $1;
            }
            elsif ($entry =~ /^numv:\s*(\d+)/) {
                $numv = $1;
            }
            elsif ($entry =~ /^lastupdate:\s*(\d+)/) {
                $lastupdate = $1;
            }
            elsif ($entry =~ /^spamsess:\s*(.*)/) {
                foreach my $sid (split(/\s+/, $1)) {
                    $spamsess{$sid} = 1;
                }
            }
        }
        close $state_fh;

        # From version 0.1.12 the names of the overall categories changed;
        # translate the old spellings while loading.
        my %legacy_name = (
            "spam"     => "Not-Delivered(SPAM)",
            "passed"   => "Passed",
            "banned"   => "Banned",
            "infected" => "Infected",
        );

        dbg("opening file $namesfile");
        open (my $names_fh, '<', $namesfile)
            or die "Could not open $namesfile: $!";
        while (my $entry = <$names_fh>) {
            if ($entry =~ /^(\d+)\s+(.*)/) {
                my $id   = $1;
                my $name = $2;
                $name = $legacy_name{$name} if exists $legacy_name{$name};
                $rvid{$name} = $id;
                $vnames{$id} = $name;
            }
        }
        close $names_fh;

        dbg("opening file $countfile");
        open (my $count_fh, '<', $countfile)
            or die "Could not open $countfile: $!";
        while (my $entry = <$count_fh>) {
            if ($entry =~ /^(\d+)\s+(\d+)/) {
                $occurence{$1} = $2;
            }
        }
        close $count_fh;

        dbg("opening file $seenfile");
        open (my $seen_fh, '<', $seenfile)
            or die "Could not open $seenfile: $!";
        while (my $entry = <$seen_fh>) {
            if ($entry =~ /^(\d+)\s+(\d+)\s+(\d+)/) {
                $firstseen{$1} = $2;
                $lastseen{$1}  = $3;
            }
        }
        close $seen_fh;
    }

    #
    # If we have not run before reset...
    #
    if (!defined $spos) {
        msg("First Time Run");
        $spos       = 0; # position into the log file
        $numv       = 0; # number of virus types seen
        $lastupdate = 0; # epoch of the last global rrd update
    }

    #
    # A state file that recorded a position but lacks the other fields
    # must not leave the counters undefined. New ids are handed out as
    # $numv + 1, so resume above the highest id already in use.
    #
    if (!defined $numv) {
        $numv = 0;
        foreach my $id (keys %vnames) {
            $numv = $id if $id > $numv;
        }
    }
    $lastupdate = 0 if !defined $lastupdate;

    #
    # If -r <file> on command line start at beginning of file
    #
    if ($opt{r}) {
        $spos = 0;
    }

    if ($debug) {
        dbg("start position: $spos numv: $numv lastupdate: $lastupdate");
        dbg("left over spam session ids: "
            . join('', map { " $_" } keys %spamsess));
        while ( my ($id, $count) = each (%occurence)) {
            # The count file may mention an id the names file does not.
            my $name = defined $vnames{$id} ? $vnames{$id} : "(unnamed)";
            dbg("#$id: $name, seen $count times");
        }
    }
}


sub saveState {

    #
    # Write everything loadState() needs for the next invocation: the
    # current position in the log ($pos), the id counter, the time of the
    # last rrd update, the locale name and the still-open spam session
    # ids go to the state file; the per-id tables go to the names, count
    # and seen files. Dies if a file cannot be opened or fully written.
    #
    dbg("saveState(): eof: $eof numv: $numv lastupdate: $lastupdate");

    open (my $state_fh, '>', $statefile)
        or die "Could not write to $statefile: $!";
    print {$state_fh} "pos: $pos\n",
                      "numv: $numv\n",
                      "lastupdate: $lastupdate\n",
                      "LC_TIME: $locale\n",
                      "spamsess: ", (map { "$_ " } keys %spamsess), "\n";
    # Buffered write errors (disk full, ...) only show up at close time.
    close ($state_fh) or die "Could not write to $statefile: $!";

    open (my $names_fh, '>', $namesfile)
        or die "Could not write to $namesfile: $!";
    open (my $count_fh, '>', $countfile)
        or die "Could not write to $countfile: $!";
    open (my $seen_fh, '>', $seenfile)
        or die "Could not write to $seenfile: $!";

    foreach my $id (keys %vnames) {
        # A missing value would produce a line that loadState() cannot
        # parse back, so write 0 instead of an empty field.
        my $count = defined $occurence{$id} ? $occurence{$id} : 0;
        my $first = defined $firstseen{$id} ? $firstseen{$id} : 0;
        my $last  = defined $lastseen{$id}  ? $lastseen{$id}  : 0;

        print {$names_fh} "$id $vnames{$id}\n";
        print {$count_fh} "$id $count\n";
        print {$seen_fh}  "$id $first $last\n";
    }

    close ($names_fh) or die "Could not write to $namesfile: $!";
    close ($count_fh) or die "Could not write to $countfile: $!";
    close ($seen_fh)  or die "Could not write to $seenfile: $!";

}


#
# Map a virus/category name to its numeric ID. A name that has not been
# seen before gets the next free ID, is entered into the name tables and
# has its first-seen time set to the given epoch.
#
sub getVid {
    my ($virus, $epoch) = @_;

    # Known name: nothing to register.
    return $rvid{$virus} if exists $rvid{$virus};

    my $fresh_id = ++$numv;

    msg("New id (#$fresh_id, $virus) seen at $epoch");

    $rvid{$virus}         = $fresh_id;
    $vnames{$fresh_id}    = $virus;
    $firstseen{$fresh_id} = $epoch;

    return $fresh_id;
}


#
# Count one more sighting of the given ID and remember the epoch at which
# it happened as the last-seen time.
#
sub upCount {
    my ($id, $epoch) = @_;

    # Incrementing an entry that does not exist yet starts it at 1.
    $occurence{$id}++;

    $lastseen{$id} = $epoch;
}



#
# Classify the log line currently held in the global $line.
#
# The line's syslog timestamp is turned into an epoch, all rrd files are
# brought up to date for every whole $rrdstep that has passed since
# $lastupdate, and finally the counters of the category the line belongs
# to (Passed, Not-Delivered, Banned, Infected plus the individual virus
# names) are incremented. Takes no arguments and returns nothing useful;
# it works entirely on package globals.
#
sub classify {
    my $id;
    # Leading whitespace-separated fields of the line, in order. The sixth
    # one is used as the session id for spam tracking.
    my ($mon, $day, $time, $host, $prog, $sid) = split(/\s+/, $line);

    #
    # Check that the environment locale matches what is being written
    # by syslog: the month name must be a key of %months.
    #
    my $tmp = $months{"$mon"};
    if (!defined($tmp)) {
        do_exit(1, "Unknown month \"$mon\" (current locale is \"$locale\")");
    }
    
    # From here on $mon is the zero-based month index.
    $mon = $tmp;

    #
    # Generate a seconds-since-1970 epoch and formatted date string.
    # The log line carries no year, so the current year is tried first.
    #
    my ($hour,$min,$sec) = split (/:/, $time);
    $epoch = timelocal($sec, $min, $hour, $day, $mon, $year-1900);

    if ($epoch > time()) {
        # a date in the future must actually be from last year
        $epoch = timelocal($sec, $min, $hour, $day, $mon, $year-1901);
    }
    if (!defined $lastepoch) {
        $lastepoch = $epoch - 1;
    }

    # Only used in debug messages.
    # NOTE(review): this always prints $year, even when $epoch was moved
    # to the previous year just above - confirm whether that is intended.
    my $isodate = sprintf("%4u-%02u-%02u", $year, $mon+1, $day) .
                  " $hour:$min:$sec";
    dbg("line at $isodate epoch: $epoch");

    #
    # Update all rrds if we are more than $rrdstep seconds since the last
    # update
    #
    if ($lastupdate == 0) {
        # First update ever: start on the $rrdstep boundary at or before
        # this line.
        $lastupdate = int($epoch / $rrdstep) * $rrdstep;
        dbg("First update: $lastupdate");
    }

    # One round of updates per whole step that has elapsed, each stamped
    # with the step's own time. This happens before the current line is
    # counted.
    my $count = int(($epoch - $lastupdate) / $rrdstep);
    for (my $i = 1; $i <= $count; $i++) {
        $lastupdate = $lastupdate + $rrdstep;
        foreach $id (keys %vnames) {
            updateRRD($id, $lastupdate);
        }
    }

    #
    # If this line contains a SPAM tag of some sort don't classify it
    # now, but record the fact and move on to the next line
    #
    if ($line =~ /\sSPAM(-TAG)*,\s/) {
        dbg("SPAM: $epoch: $isodate $sid");
        $spamsess{$sid} = 1;
        return;
    }

    #
    # Save the stats according to the classification of the email
    #
    if ($line =~ /Passed/) {

        #
        # Update the overall passed emails statistics. A session that was
        # marked as spam earlier is counted separately.
        #
        if (defined $spamsess{$sid}) {
            dbg("Passed(SPAM) $epoch: $isodate");
            $id = getVid("Passed(SPAM)", $epoch);
        } else {
            dbg("Passed: $epoch: $isodate");
            $id = getVid("Passed", $epoch);
        }
        upCount($id, $epoch);

    } elsif ($line =~ /\sNot-Delivered,\s/) {

        #
        # Update the overall not delivered statistics, again keeping
        # sessions marked as spam apart.
        #
        if (defined $spamsess{$sid}) {
            dbg("Not-Delivered(SPAM): $epoch: $isodate");
            $id = getVid("Not-Delivered(SPAM)", $epoch);
        } else {
            dbg("Not-Delivered: $epoch: $isodate");
            $id = getVid("Not-Delivered", $epoch);
        }
        upCount($id, $epoch);

    } elsif ($line =~ /\sBANNED\sname\/type\s/) {
        dbg("Banned $epoch: $isodate");

        #
        # Update the overall banned emails statistics
        #
        $id = getVid("Banned", $epoch);
        upCount($id, $epoch);

    # Each alternative below captures the virus name(s) in $1. The first
    # pattern that matches wins.
    } elsif ($line =~ /\sINFECTED\s+\((.*?[\(.*?\)]*)\)/   or # amavisd-new
             $line =~ /\sPossible virus.*->\s+'(.*?)'/     or # amavis-ng
             $line =~ /.*parts\/\d+:\s+(.*?)\s+FOUND/      or # amavis-ng 
             $line =~ /\squarantine[:|d;].*?virus='(.*?)'/ or # amavisd
             $line =~ /.*part-\d+:\s+(.*?)\s+FOUND/ ) {         # clamav
        my $viruses = $1;
        dbg("viruses: \"$viruses\" at $epoch: $isodate");

        #
        # Update the overall infected emails statistics
        #
        $id = getVid("Infected", $epoch);
        upCount($id, $epoch);

        #
        # What is this specific nasty little bugger(s) called?
        # Update his statistics as well. Several names may be listed,
        # separated by commas; each distinct name is counted once per line.
        #
        my @list = split(/,+\s+/, $viruses);
        my %seen;
        foreach my $virus (@list) {
            if (!$seen{$virus}) {
                $id = getVid($virus, $epoch);
                upCount($id, $epoch);
                $seen{$virus} = 1;
            }
        }
    }

    #
    # Since we have classified this session-id we need to remove it
    # from the %spamsess hash
    #
    # NOTE(review): this also runs for lines that matched none of the
    # patterns above, so an unrelated line with the same session id clears
    # the spam mark - confirm that this is intended.
    delete $spamsess{$sid};
}




#
# Read the log file $fname from byte offset $start and hand every amavis
# line to classify(). Reading stops once the offset reaches $stop or the
# file ends. The position reached is kept in the global $pos, and the
# state is saved every 1000 lines. Exits through do_exit() if the file
# cannot be opened or positioned.
#
sub parseFile {

    my ($fname, $start, $stop) = @_;
    dbg("parseFile ($fname, $start, $stop)");

    #
    # Open up the file we need to parse. The explicit read mode keeps a
    # file name containing characters such as '>' or '|' from being
    # interpreted as an open mode.
    #
    open (my $log_fh, '<', $fname)
        or do_exit(1, "Could not open file $fname: $!");
    seek ($log_fh, $start, 0)
        or do_exit(1, "Could not seek to $start in file $fname: $!");

    #
    # Loop each line until the current end of file. The defined() test is
    # required here: a readline combined with another condition does not
    # get Perl's implicit one, so a final line consisting of "0" would
    # otherwise be taken as false and end the loop too early.
    #
    $pos = $start;
    my $lineid = 0;
    while ($pos < $stop and defined($line = <$log_fh>))
    {
        $lineid++;
        $lastepoch = $epoch;

        if ($line =~ /amavis.*?\[\d+\]:/) {
            classify();
        }

        #
        # Where did we get to in the file?
        #
        $pos = tell($log_fh);

        #
        # Save the current statistics every 1000 lines. This way
        # if the program dies we don't have to start again from the
        # beginning each time. Also good for monitoring the graphs
        # to see where we are up to.
        #
        if (!($lineid % 1000)) {
            saveState();
        }

    }
    close($log_fh);

}


#
# The log file has been rotated since the last run: locate the rotated
# copy and finish parsing it from our last position to its end. Exits
# through do_exit() if no rotated copy can be found.
#
sub parseRotFile {
    my ($logfile, $spos) = @_;

    # Date stamps (YYYYMMDD) of today and of 24 hours ago, as used in the
    # names of date-stamped rotated logs.
    my $now = time();
    my ($today, $yesterday) = map {
        my $tm = localtime($_);
        sprintf("%4u%02u%02u", $tm->year + 1900, $tm->mon + 1, $tm->mday);
    } ($now, $now - 60*60*24);

    # Try the known naming schemes in order; the first existing file wins.
    my $rotlogfile = undef;
    foreach my $suffix (".0", ".1", ".01", "-$today", "-$yesterday") {
        my $candidate = $logfile . $suffix;
        next unless -f $candidate;
        $rotlogfile = $candidate;
        last;
    }

    if (!defined($rotlogfile)) {
        err("Could not open rotated logfile.");
        err("  Tried extentions .0, .1, .01, -$today, -$yesterday");
        do_exit(1);
    }

    # Parse up to the rotated file's full size.
    parseFile ($rotlogfile, $spos, (stat $rotlogfile)[7]);
}


#
# Create the rrd file $file, starting one second before $epoch, with a
# single COUNTER data source named "hits" and a set of AVERAGE and MAX
# archives. Returns 1 on success; on failure the rrd error is reported
# through err() and -1 is returned.
#
sub createRRD {
    my ($file, $epoch) = @_;
    dbg("createRRD: $file, $epoch");

    # Data source first, then the archives.
    my @layout = (
        "DS:hits:COUNTER:".$rrdstep.":0:U",
        "RRA:AVERAGE:0.5:1:300",
        "RRA:AVERAGE:0.5:6:700",
        "RRA:AVERAGE:0.5:24:775",
        "RRA:AVERAGE:0.5:288:797",
        "RRA:MAX:0.5:1:300",
        "RRA:MAX:0.5:6:700",
        "RRA:MAX:0.5:24:775",
        "RRA:MAX:0.5:288:797",
    );

    RRDs::create($file,
                 "--start", ($epoch - 1),
                 "--step",  $rrdstep,
                 @layout);

    my $failure = RRDs::error;
    return 1 unless $failure;

    err("createRRD: $failure");
    return -1;
}


#
# Store the current count of the given ID in its rrd file with the time
# stamp $epoch. The rrd file ($statedir/<id>.rrd) is created first if it
# does not exist yet, and seeded with a zero one step earlier.
#
# Returns 1 on success (including the case where the update is skipped
# because the file already has data at or after $epoch) and -1 if rrd
# reported an error. Exits through do_exit() if the file cannot be created.
#
# Declared without a prototype: the sub takes two arguments, and an empty
# "()" prototype would make every call that passes them a compile error
# as soon as the definition is seen before the call.
#
sub updateRRD {
    my ($id, $epoch) = @_;
    my $count        = $occurence{$id};
    my $rrdfile      = "$statedir/$id.rrd";
    my $err;
    my $last;

    if (! -f $rrdfile) {
        # createRRD() signals failure with -1, which is a true value, so
        # it has to be compared rather than negated. createRRD() has
        # already reported the rrd error text itself.
        if (createRRD($rrdfile, $epoch - $rrdstep) < 0) {
            do_exit(1, "updateRRD: Can't create file $rrdfile");
        }
        my $upd = ($epoch - $rrdstep) . ":0";
        dbg("Update: $rrdfile at ", $epoch - $rrdstep, " count 0");
        RRDs::update($rrdfile, $upd);

        # A failed seed is reported but not fatal; the real update below
        # is still attempted.
        $err = RRDs::error;
        if ($err) {
            err("updateRRD: $err");
        }
    }

    dbg("Update: $rrdfile at $epoch count $count");

    $last = RRDs::last($rrdfile);
    $err = RRDs::error;
    if ($err) {
        err("updateRRD: $err");
        return -1;
    }

    #
    # We sometimes get two hits in the same second. Check for that here
    # and basically ignore it.
    #
    if ($epoch > $last) {
        my $upd = $epoch . ":" . $count;
        RRDs::update($rrdfile, $upd);

        $err = RRDs::error;
        if ($err) {
            err("updateRRD: $err");
            err("Attempted to update $rrdfile at $epoch count $count");
            return -1;
        }
    }

    return 1;
}



# Print a debugging message, prefixed with the program name, when
# debugging is switched on.
sub dbg {
    $debug and print "$me: @_\n";
}

# Print an informational message, prefixed with the program name, unless
# quiet mode has switched such messages off.
sub msg {
    $verbose and print "$me: @_\n";
}

# Print an error message, prefixed with the program name, to STDERR.
# Errors are always shown, whatever the -q and -d switches say.
sub err {
    my $text = join($", @_);
    print STDERR "$me: error: $text\n";
}


# ########################################################################
# main() program
# ########################################################################

init();
semlock();
loadState();

# Size of the live log file right now; parsing stops at this offset.
$eof = (stat($logfile))[7];

# A file that is shorter than the offset we stopped at last time must be
# a new one: the log has rotated under us.
my $rotated = ($eof < $spos);
if ($rotated) {
    msg("Logfile \"$logfile\" appears to have rotated");

    # Finish the rotated logfile first, then take the new one from its
    # very beginning.
    parseRotFile($logfile, $spos);
    $spos = 0;
}

parseFile ($logfile, $spos, $eof);
saveState();
semunlock();

