#!/usr/bin/env python

"""
Split passed in file(s) into separate mail messages,
then re-order by date into single output file.

NB: the output file can be one of the input files;
input files that are links to ones already processed are ignored.
"""

# Text printed by usage(); the single %s receives the program's base name.
Usage = """Usage: %s [--out <file>] [[--in] <file>] ...

	--debug		output debugging details at <level> (implies --verbose)
	--help		show this message
	--in		name of unordered mailbox
	--links		list linked files on <stdout> (only with --out)
	--match		restrict debugging and/or warning messages to just
			those that match <regexp>
	--out		name of ordered mailbox
	--verbose	show actions

	Default copies <stdin> to <stdout>.
"""
# getopt specifications: a trailing ':' (short) or '=' (long) marks an
# option that takes a value.  '-?' and '--help' are accepted only so that
# they reach the usage message.
ShortOpts = 'd:i:lm:o:v?'
LongOpts = ['debug=', 'help', 'in=', 'links', 'match=', 'out=', 'verbose']


import getopt, md5, os, re, sys, time

from email.Parser import Parser
from email.Errors import MessageError
from email.Utils import mktime_tz, parsedate_tz


# --- Settings changed by the command line options (see args()) ---
DebugLvl = 0		# Debug() reports messages whose level is <= this
DebugMatch = None	# compiled --match regexp; None means report everything
ListLinks = False	# --links given
Verbose = False		# --verbose (or --debug) given

# --- State collected while the input files are read ---
FileCache = {}		# (st_dev, st_ino) -> {'times': (atime, mtime), 'names': [...]}
MessageCache = {}	# MD5 digest of a message body -> None (used as a set)

# --- Dates ---
Now = time.time()	# messages dated at or after this are treated as undated
# Date given to messages without a usable Date: header.  It is replaced by
# each successfully parsed date, and finally by the newest date written.
LastDate = mktime_tz(parsedate_tz('Sat, 1 Jan 2000 00:00:00 +1000'))

# --- Patterns and fixed strings ---
ContLine = re.compile(r'\n\s+')	# a line break followed by leading whitespace
# A "From " envelope line, any continuation lines, and the first character
# of the line that follows them.
UnixFrom = re.compile(r'^From \S+ .*(\n\s.*)*\n\S')
# Subject of the placeholder message that is dropped from the output.
MUA_Subject = "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA"



def args():
	"""
	Parse the command line (sys.argv[1:]).

	Sets the module-level DebugLvl, DebugMatch, ListLinks and Verbose
	according to the options given.

	Returns (inf, ouf): inf is the list of input file names, from --in
	options first and then from the remaining arguments, or [None]
	(meaning <stdin>) when no input was named; ouf is the --out file
	name, or None when output goes to <stdout>.

	Any unusable option or option value ends the program through usage().
	"""
	global DebugLvl
	global DebugMatch
	global ListLinks
	global Verbose

	try:
		optlist, rest = getopt.getopt(sys.argv[1:], ShortOpts, LongOpts)
	except getopt.error, val:
		usage(val)

	inf, ouf = [], None

	for opt,val in optlist:
		if   opt in ('-d', '--debug'):
			# Report a bad level as a usage error, not as a traceback
			try:
				DebugLvl = int(val)
			except ValueError:
				usage('--debug needs an integer level, not "%s"' % val)
			Verbose = True
		elif opt in ('-i', '--in'):
			inf.append(val)
		elif opt in ('-l', '--links'):
			ListLinks = True
		elif opt in ('-m', '--match'):
			# Likewise for a pattern that does not compile
			try:
				DebugMatch = re.compile(val)
			except re.error, why:
				usage('--match: bad regexp "%s" - %s' % (val, why))
		elif opt in ('-o', '--out'):
			ouf = val
		elif opt in ('-v', '--verbose'):
			Verbose = True
		else:
			# '-?' and '--help' end up here
			usage()

	inf.extend(rest)

	if not inf:
		inf.append(None)	# None stands for <stdin>

	return inf, ouf



def print_links():

	for id,item in FileCache.items():
		names = item['names']
		if len(names) > 1:
			print ' '.join(names)



def process_file(name):

	Debug(2, '''"Reading file %s" % (name or '<stdin>')''')

	try:
		if name is None:
			name = '<stdin>'
			fd = sys.stdin
		else:
			fd = open(name)

		messages = []

		r = os.fstat(fd.fileno())
		id = (r.st_dev, r.st_ino)
		if id not in FileCache:
			FileCache[id] = {'times':(r.st_atime, r.st_mtime), 'names':[name]}

			n = 0
			for message in read_messages(fd):
				mesg = process_message(message, n, name)
				n += 1
				if mesg is None:
					continue
				messages.append(mesg)
		else:
			FileCache[id]['names'] += [name]
			Debug(2, '''"File %s is link to %s (already processed)" % (name, FileCache[id]['names'][0])''')

		fd.close()
	except (IOError, OSError), val:
		error('Unable to read "%s" - %s.' % (name, val))

	Debug(1, '''"File %s => %s messages" % (name, len(messages))''')

	return messages



def process_message(text, msgnum, name):

	global LastDate

	Debug(3, r'''"Message %s \"%s\"..." % (msgnum, text[:79].replace('\n', '\\n'))''')

	# Parser objects to continuation lines in unixfrom line

	unixfrom, text = splitunixfrom(text.rstrip())	# Restore trailing newlines later

	# As the mail server can muck with headers,
	# we need to identify messages by the body only,
	# so find the start of the body and use the body as the message signature.

	body = text[text.find('\n\n')+2:]
	tag = md5.new(body).digest()
	if tag in MessageCache:
		Debug(2, '''"Message %s discarded: MD5 sum in cache" % msgnum''')
		return None
	MessageCache[tag] = None
	Debug(3, '''"Message %s text size %s tag %s" % (msgnum, len(body), `tag`)''')

	try:
		msg = Parser().parsestr(text, headersonly=True)
	except MessageError, val:
		warn("File %s message %s parse error: %s" % (name, msgnum, str(val)))
		return LastDate, msgnum, unixfrom + text + '\n\n'

	subj = msg['Subject']
	Debug(3, r'''"Subject: %s" % subj''')
	if subj == MUA_Subject:
		Debug(2, '''"Message %s discarded: Subject: %s" % (msgnum, MUA_Subject)''')
		return None

	date = msg['Date']
	if not date:
		date = LastDate
	else:
		try:
			date = mktime_tz(parsedate_tz(date))
			if date >= Now:	# Ignore future
				date = LastDate
			else:
				LastDate = date
		except:
			warn("File %s message %s parse error for {Date: %s}" % (name, msgnum, msg['Date']))
			date = LastDate

	Debug(2, '''"Found message %s: date=%s" % (msgnum, date)''')

	return date, msgnum, unixfrom + text + '\n\n'



def read_messages(fd):
	"""
	Generate the text of each message found in fd.

	fd	any iterable of lines (each ending with its newline), such as
		an open mailbox file.

	A line starting with "From " is held back until the next line shows
	whether it really opens a new message: lines that start with
	whitespace (other than a bare newline) are taken as its
	continuation, and the UnixFrom pattern then decides using the first
	line that follows.  When it does not open a new message it is
	ordinary text of the current one.
	"""

	data = []
	pending = ''	# held back "From " line plus its continuation lines

	for line in fd:
		if pending:
			if line[0].isspace() and line[0] != '\n':
				pending += line
				continue
			if data and UnixFrom.match(pending + line) is not None:
				# A real separator: finish the current message and
				# start the next one with the held back line(s)
				yield ''.join(data)
				data = [pending]
			else:
				data.append(pending)
			pending = ''
		if line[:5] == 'From ':
			pending = line
			continue
		data.append(line)

	# Input ended while a "From " line was still held back: with nothing
	# after it, it cannot open a message, so keep it as text rather than
	# losing it.
	if pending:
		data.append(pending)

	if data:
		yield ''.join(data)



def sort_messages(files):
	"""
	Read every file named in files and return the texts of all their
	messages, ordered by the (date, number, text) tuples that
	process_file() delivers.

	When at least one message was found, LastDate is set to the date of
	the last message in that order.
	"""

	global LastDate

	collected = []
	for path in files:
		collected.extend(process_file(path))

	if collected:
		collected.sort()	# tuples compare by date first

		LastDate = collected[-1][0]
		Debug(1, '''"last date = %r" % LastDate''')

	return [body for when,number,body in collected]



def splitunixfrom(text):
	"""
	Separate a leading "From " envelope line from the rest of a message.

	Returns (unixfrom, rest).  unixfrom is '' when text does not start
	with something the UnixFrom pattern accepts; otherwise it is the
	envelope with each line break plus following whitespace replaced by
	a single space, so that continuation lines are folded into one line.
	"""

	found = UnixFrom.match(text)
	if found is None:
		return '', text

	# The pattern also consumes the first character of the line after
	# the envelope; hand that character back to the rest of the message.
	cut = found.end() - 1
	envelope, remainder = text[:cut], text[cut:]

	return ContLine.sub(' ', envelope), remainder



def Debug(lvl, str):
	"""
	Report a debugging message through warn() when DebugLvl >= lvl.

	lvl	level of detail of this message; it also widens the column
		that holds the caller's name.
	str	Python source text of an expression.  It is evaluated with
		the caller's globals and locals, and only when the message is
		really going to be reported, so silent debug calls cost almost
		nothing.  Should the evaluation fail, the unevaluated text is
		reported instead (with a traceback when DebugLvl > 9).
	"""

	if DebugLvl < lvl:
		return

	pad = ''

	#
	#	Delayed evaluation of debug() argument allowed
	#
	# Ask for the caller's frame directly: raising a string to reach it
	# through the traceback is no longer a valid raise in later Pythons.
	cf = sys._getframe(1)
	try:
		try:
			pad = _frame_name(cf)
			if str:
				# NB: eval() of text written in this file only;
				# never hand it anything that comes from outside.
				str = eval(str, cf.f_globals, cf.f_locals)
		except Exception:
			if DebugLvl > 9:
				import traceback
				traceback.print_exc()
	finally:
		del cf	# no circ. refs!

	warn("%-*s %s" % (35+lvl, pad, str))


def _frame_name(frm,  sep=os.sep):

	code = frm.f_code
	filename = code.co_filename
	filename = filename[filename.rfind(sep)+1:]	# `basename'
	self = frm.f_locals.get('self')
	if self is None:
		return '%s:%s' % (filename, code.co_name)
	return '%s:%s.%s' % (filename, self.__class__.__name__, code.co_name)


def error(reason):
	"""Write reason and a newline to <stderr>, then exit with status 1."""
	sys.stderr.write('%s\n' % reason)
	raise SystemExit(1)


def report(message):
	"""Write message and a newline to <stdout> and flush it at once."""
	out = sys.stdout
	out.write('%s\n' % message)
	out.flush()


def usage(reason=''):
	"""
	Explain on <stderr> how to call the program, then exit with status 1.

	reason	when given, it is shown (indented by a tab) before the Usage
		text; the module's doc string follows the Usage text.
	"""
	sys.stdout.flush()	# pending normal output goes out before the complaint
	err = sys.stderr
	if reason:
		err.write('\t%s\n\n' % reason)
	program = os.path.basename(sys.argv[0])
	err.write(Usage % program)
	err.write(__doc__)
	sys.exit(1)


def warn(msg):
	"""
	Write msg and a newline to <stderr>, unless a --match pattern
	(DebugMatch) is set and is not found anywhere in msg.
	"""

	wanted = DebugMatch is None or DebugMatch.search(msg) is not None
	if wanted:
		sys.stdout.flush()	# flushed first, ahead of the <stderr> write
		sys.stderr.write('%s\n' % msg)
		sys.stderr.flush()



def main():

	in_files, out_file = args()

	messages = sort_messages(in_files)

	if not messages:
		if Verbose:
			report('No messages found in %r.' % in_files)
		sys.exit(0)

	try:
		Debug(1, '''"Writing %s messages to %s" % (len(messages), out_file or '<stdout>')''')
		if not out_file:
			fd = sys.stdout
		else:
			fd = open(out_file, "w")
		fd.write('\n\n'.join(messages))
		fd.close()
		if out_file and LastDate:
			os.utime(out_file, (LastDate, LastDate))
		# if out_file and FileCache:
		# 	# Find newest a/mtime in FileCache and set out_file to have same
		# 	matimes = [(item['times'][1], item['times']) for item in FileCache.values()]
		# 	matimes.sort()
		# 	os.utime(out_file, matimes[-1][1])
	except IOError, val:
		error('Could not write "%s": %s' % (out_file or '<stdout>', str(val)))

	if out_file and ListLinks:
		print_links()



# Run main() only when executed as a script, not when imported.
if __name__ == '__main__':
	try:
		main()
	except KeyboardInterrupt:
		# Interrupted from the keyboard: stop quietly, without a traceback.
		pass