#!/usr/bin/env python

"""Summarize the contents of a syslog log file.

The syslog(3) service writes system log messages in a certain format:

	Jan 17 19:21:50 zeus kernel: klogd 1.3-3, log source = /proc/kmsg started.

This program summarizes the contents of such a file, by displaying each
unique (except for the time) line once, and also the number of times such
a line occurs in the input. The lines are displayed in the order they occur
in the input.

Lars Wirzenius <liw@iki.fi>"""

# Default file of regular expressions (one per line) describing lines to
# leave out of the summary; replaced by the -i/--ignore option.
IGNORE_FILENAME = "/etc/syslog-summary/ignore"
# File remembering how far each log file has already been summarized.
# None means no state is read or saved; set by the -s/--state option.
STATE_FILENAME = None
# When true, "last message repeated N times" lines are added to the count
# of the preceding message; set by the -r/--repeat option.
REPEAT = 0
# When true, a file's header and statistics are printed only if the file
# has at least one line to report; set by the -q/--quiet option.
QUIET = 0

import sys, re, getopt, string, md5

# Timestamp prefixes recognized at the start of a log line. split_date()
# tries them in order and uses the first one that matches.
datepats = [
	# Month, day and hh:mm:ss, e.g. "Jan 17 19:21:50 ".
	re.compile(r"^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 0-9][0-9] [ 0-9][0-9]:[0-9][0-9]:[0-9][0-9] "),
	# Weekday and month followed by four characters and ":mm".
	# NOTE(review): this pattern and the next have no space between the
	# day-of-month and hour fields -- confirm against the log format they
	# are meant to match.
	re.compile(r"^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 0-9][0-9][0-9][0-9]:[0-9][0-9] "),
	# Same as the previous pattern, with a ":ss" seconds field.
	re.compile(r"^(Mon|Tue|Wed|Thu|Fri|Sat|Sun) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [ 0-9][0-9][0-9][0-9]:[0-9][0-9]:[0-9][0-9] "),
]
# Matches two space-separated words followed by "[digits]: " at the start
# of the text after the date, e.g. "zeus kernel[123]: ". Group 1 holds the
# two words so that the bracketed number can be dropped.
pidpat = re.compile(r"^([^ ]* [^ ]*)\[[0-9][0-9]*\]: ")
# Matches "<word> last message repeated N times"; group 1 is N.
repeatpat = re.compile(r"^[^ ]* last message repeated (\d+) times$")

# Compiled regular expressions for lines to ignore; filled in by main().
ignore_pats = []

def read_patterns(filename):
	"""Read regular expressions, one per line, from a file.

	filename -- path of the pattern file.

	Returns a list of compiled patterns in file order. If the file cannot
	be opened, an empty list is returned. Empty lines are skipped: an
	empty regular expression matches every line, so a stray blank line
	would otherwise cause all input to be ignored.

	Raises re.error if a line is not a valid regular expression.
	"""
	try:
		f = open(filename, "r")
	except IOError:
		return []
	pats = []
	try:
		for line in f:
			# Drop only the line terminator; other whitespace may be a
			# deliberate part of the pattern.
			if line[-1:] == "\n":
				line = line[:-1]
			if line:
				pats.append(re.compile(line))
	finally:
		# Close the file even if a pattern fails to compile.
		f.close()
	return pats

def read_states(filename):
	"""Load the per-log-file resume state from a state file.

	filename -- path of the state file; a false value (None or "") means
	            that no state file is in use.

	Each line of the file has the form

		<log filename> <line count> <md5 hex digest>

	The line is split from the right, so a log filename that itself
	contains spaces is read back intact.

	Returns a dictionary mapping each log filename to a tuple
	(line count, digest). The dictionary is empty if no filename is
	given or the file cannot be opened. Lines that do not have three
	fields, or whose line count is not an integer, are skipped; the
	affected log file then simply has no saved state.
	"""
	states = {}
	if not filename:
		return states
	try:
		f = open(filename, "r")
	except IOError:
		return states
	try:
		for line in f:
			fields = line.rsplit(None, 2)
			if len(fields) != 3:
				continue
			try:
				count = int(fields[1])
			except ValueError:
				continue
			states[fields[0]] = (count, fields[2])
	finally:
		f.close()
	return states

def save_states(filename, states):
	"""Write the resume state of every log file to the state file.

	filename -- path of the state file; a false value (None or "") means
	            that nothing is written.
	states   -- dictionary mapping each log filename to a sequence whose
	            first two items are the line count and the md5 hex
	            digest.

	One line of the form "<log filename> <line count> <digest>" is
	written per entry. Saving is best effort: if the file cannot be
	opened for writing, the function returns without doing anything.
	"""
	if not filename:
		return
	try:
		f = open(filename, "w")
	except IOError:
		return
	try:
		# A separate name keeps the loop from shadowing the parameter.
		for logname, value in states.items():
			f.write("%s %d %s\n" % (logname, value[0], value[1]))
	finally:
		# Close the file even if formatting or writing an entry fails.
		f.close()

def should_be_ignored(line):
	"""Return 1 if any pattern in ignore_pats is found in line, else 0."""
	matched = 0
	for regex in ignore_pats:
		if regex.search(line) is not None:
			# One hit is enough; later patterns need not be tried.
			matched = 1
			break
	return matched

def printable_md5(str):
	"""Return the lowercase hexadecimal form of a binary digest.

	str -- the raw digest, as a string of characters or as a bytes
	       object.

	Every element is rendered as two hexadecimal digits. Iterating over
	a bytes object can yield integers rather than one-character strings,
	so both kinds of element are accepted.
	"""
	chars = []
	for char in str:
		if not isinstance(char, int):
			char = ord(char)
		chars.append("%02x" % char)
	return "".join(chars)

def split_date(line):
	for pat in datepats:
		m = pat.match(line)
		if m:
			return line[:m.end()], line[m.end():]
	print "line has bad date", "<" + string.rstrip(line) + ">"
	return None, line

def summarize(filename, states):
	counts = {}
	order = []
	ignored_count = 0
	if not QUIET:
		print "Summarizing %s" % filename
	file = open(filename, "r")
	linecount = 0

	md5obj = md5.new()
	if states.has_key(filename):
		oldlines, oldmd5 = states[filename]
		for i in xrange(oldlines):
			line = file.readline()
			md5obj.update(line)
		if printable_md5(md5obj.digest()) != oldmd5:
			file.seek(0, 0)
			md5obj = md5.new()
		else:
			linecount = oldlines
	if not QUIET:
		print "%8d Lines skipped (already processed)" % linecount

	line = file.readline()
	previous = None
	while line:
		md5obj.update(line)
		linecount = linecount + 1
		date, rest = split_date(line)
		if date:
			found = pidpat.search(rest)
			if found:
				rest = found.group(1)+": "+rest[found.end():]

		count = 1
		repeated = None
		if REPEAT:
			repeated=repeatpat.search(rest)
		if repeated and previous:
			count = int(repeated.group(1))
			rest = previous

		if should_be_ignored(rest):
			ignored_count = ignored_count + count
		else:
			if counts.has_key(rest):
				counts[rest] = counts[rest] + count
			else:
				assert count==1
				counts[rest] = count
				order.append(rest)

		if not repeated:
			previous = rest
		line = file.readline()
	file.close()
	md5new = printable_md5(md5obj.digest())
	states[filename] = (linecount, md5new)
	if QUIET and order:
		print "Summarizing %s" % filename
	if not QUIET or order:
		print "%8d Patterns to ignore" % len(ignore_pats)
		print "%8d Ignored lines" % ignored_count
	for rest in order:
		print "%8d %s" % (counts[rest], rest),
	if not QUIET or order:
		print

def main():
	"""Parse the command line, summarize each log file and save the state.

	Recognized options:

		-i FILE, --ignore=FILE   read ignore patterns from FILE
		-s FILE, --state=FILE    read and save resume state in FILE
		-r, --repeat             set REPEAT
		-q, --quiet              set QUIET

	The remaining arguments are the log files to summarize, in order.
	"""
	global ignore_pats, IGNORE_FILENAME, STATE_FILENAME, REPEAT, QUIET

	short_opts = "i:qs:r"
	long_opts = ["ignore=", "quiet", "state=", "repeat"]
	opts, args = getopt.getopt(sys.argv[1:], short_opts, long_opts)

	for opt, optarg in opts:
		if opt in ("-i", "--ignore"):
			IGNORE_FILENAME = optarg
		elif opt in ("-s", "--state"):
			STATE_FILENAME = optarg
		elif opt in ("-r", "--repeat"):
			REPEAT = 1
		elif opt in ("-q", "--quiet"):
			QUIET = 1

	# Patterns and states are loaded only after all options are known,
	# since the options may change which files they come from.
	ignore_pats = read_patterns(IGNORE_FILENAME)
	states = read_states(STATE_FILENAME)
	for filename in args:
		summarize(filename, states)
	save_states(STATE_FILENAME, states)

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
	main()
