#!/usr/bin/env python
"""
Split passed in file(s) into separate mail messages,
then re-order by date into single output file.
NB: the output file can be one of the input files;
    input files that are links to ones already processed are ignored.
"""

Usage = """Usage: %s [--out <file>] [[--in] <file>] ...
    --debug <level>  output debugging details at <level>
    --in <file>      name of unordered mailbox
    --links          list linked files on <stdout>
    --match <regex>  restrict debugging and/or warning messages
                     to just those that match <regex>
    --out <file>     name of ordered mailbox
    --verbose        show actions
Default copies <stdin> to <stdout>.
"""

ShortOpts = 'd:i:lm:o:v?'
LongOpts = ['debug=', 'help', 'in=', 'links', 'match=', 'out=', 'verbose']

# Runs on Python 2.6+ and Python 3: hashlib replaces the removed `md5`
# module and the lowercase email submodule names replace the old
# email.Parser / email.Errors / email.Utils aliases.
import getopt
import hashlib
import os
import re
import sys
import time
from email.errors import MessageError
from email.parser import Parser
from email.utils import mktime_tz, parsedate_tz

DebugLvl = 0                # messages above this level are suppressed
DebugMatch = None           # compiled --match regex, or None for "show all"
ContLine = re.compile(r'\n\s+')
FileCache = {}              # (st_dev, st_ino) -> {'times': ..., 'names': [...]}
# Fallback date for messages without a usable Date: header; it is
# updated to the most recent valid date seen while processing.
LastDate = mktime_tz(parsedate_tz('Sat, 1 Jan 2000 00:00:00 +1000'))
ListLinks = False
MessageCache = {}           # MD5 digest of message body -> None (a set)
Now = time.time()
UnixFrom = re.compile(r'^From \S+ .*(\n\s.*)*\n\S')
Verbose = False

MUA_Subject = "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA"

_PY3 = sys.version_info[0] >= 3
# On Python 3, latin-1 maps every byte to exactly one character and
# newline='\n' disables newline translation, so mail data passes through
# unchanged.  Python 2 file objects already deal in byte strings.
_IO_KWARGS = {'encoding': 'latin-1', 'newline': '\n'} if _PY3 else {}


def _open_mailbox(name, mode='r'):
    """Open the file *name* in *mode* for byte-transparent text I/O."""
    return open(name, mode, **_IO_KWARGS)


def _wrap_stdio(stream, mode):
    """Return a byte-transparent view of the standard stream *stream*.

    On Python 2, or when the stream has no usable file descriptor, the
    stream itself is returned.  Otherwise a new file object sharing the
    descriptor is returned; closing it leaves the descriptor open.
    """
    if not _PY3:
        return stream
    try:
        if 'w' in mode:
            stream.flush()      # keep earlier output ahead of ours
        fileno = stream.fileno()
    except (AttributeError, IOError, ValueError):
        return stream
    return open(fileno, mode, closefd=False, **_IO_KWARGS)


def _to_bytes(text):
    """Return *text* as bytes so that it can be fed to hashlib."""
    if isinstance(text, bytes):     # always the case on Python 2
        return text
    return text.encode('utf-8', 'surrogatepass')


def args():
    """Parse sys.argv and set the option globals.

    Returns (input_names, output_name).  input_names is [None] when no
    input was named (meaning standard input); output_name is None when
    no --out was given.  Invalid options print the usage text and exit.
    """
    global DebugLvl, DebugMatch, ListLinks, Verbose
    try:
        optlist, rest = getopt.getopt(sys.argv[1:], ShortOpts, LongOpts)
    except getopt.error as val:
        usage(val)
    inf, ouf = [], None
    for opt, val in optlist:
        if opt in ('-d', '--debug'):
            try:
                DebugLvl = int(val)
            except ValueError:
                usage('Invalid debug level %r' % val)
            Verbose = True
        elif opt in ('-i', '--in'):
            inf.append(val)
        elif opt in ('-l', '--links'):
            ListLinks = True
        elif opt in ('-m', '--match'):
            try:
                DebugMatch = re.compile(val)
            except re.error as err:
                usage('Invalid match pattern %r: %s' % (val, err))
        elif opt in ('-o', '--out'):
            ouf = val
        elif opt in ('-v', '--verbose'):
            Verbose = True
        else:                       # -? / --help
            usage()
    inf.extend(rest)
    if not inf:
        inf.append(None)
    return inf, ouf


def print_links():
    """Print, one group per line, the names that refer to the same file."""
    for item in FileCache.values():
        names = item['names']
        if len(names) > 1:
            report(' '.join(names))


def process_file(name):
    """Read the mailbox *name* (None means standard input).

    Returns a list of (date, msgnum, text) tuples as produced by
    process_message().  A file whose (device, inode) pair has already
    been seen is only recorded as a link and contributes no messages.
    Exits through error() when the file cannot be read.
    """
    Debug(2, '''"Reading file %s" % (name or '<stdin>')''')
    messages = []
    try:
        if name is None:
            name = '<stdin>'
            fd = _wrap_stdio(sys.stdin, 'r')
        else:
            fd = _open_mailbox(name)
        try:
            st = os.fstat(fd.fileno())
            key = (st.st_dev, st.st_ino)
            if key not in FileCache:
                FileCache[key] = {'times': (st.st_atime, st.st_mtime),
                                  'names': [name]}
                # msgnum counts every message read, including discarded ones
                for n, message in enumerate(read_messages(fd)):
                    mesg = process_message(message, n, name)
                    if mesg is not None:
                        messages.append(mesg)
            else:
                FileCache[key]['names'].append(name)
                Debug(2, '''"File %s is link to %s (already processed)" % (name, FileCache[key]['names'][0])''')
        finally:
            if fd is not sys.stdin:
                fd.close()
    except (IOError, OSError) as val:
        error('Unable to read "%s" - %s.' % (name, val))
    Debug(1, '''"File %s => %s messages" % (name, len(messages))''')
    return messages


def process_message(text, msgnum, name):
    """Classify one raw message.

    Returns None when the message is a duplicate (same body MD5 as an
    earlier message) or is the MUA's internal folder message; otherwise
    returns (date, msgnum, text) where text is the right-stripped message
    followed by two newlines.  Messages with a missing, unparseable or
    future Date: header get the most recent valid date seen so far.
    """
    global LastDate
    Debug(3, r'''"Message %s \"%s\"..." % (msgnum, text[:79].replace('\n', '\\n'))''')
    # Parser objects to continuation lines in unixfrom line
    unixfrom, text = splitunixfrom(text.rstrip())  # Restore trailing newlines later
    # As the mail server can muck with headers,
    # we need to identify messages by the body only,
    # so find the start of the body and use the body as the message signature.
    # (When there is no blank line, find() gives -1 and the slice starts at
    # index 1, so nearly the whole text is hashed.)
    body = text[text.find('\n\n')+2:]
    tag = hashlib.md5(_to_bytes(body)).digest()
    if tag in MessageCache:
        Debug(2, '''"Message %s discarded: MD5 sum in cache" % msgnum''')
        return None
    MessageCache[tag] = None
    Debug(3, '''"Message %s text size %s tag %s" % (msgnum, len(body), repr(tag))''')
    try:
        msg = Parser().parsestr(text, headersonly=True)
    except MessageError as val:
        warn("File %s message %s parse error: %s" % (name, msgnum, str(val)))
        return LastDate, msgnum, unixfrom + text + '\n\n'
    subj = msg['Subject']
    Debug(3, r'''"Subject: %s" % subj''')
    if subj == MUA_Subject:
        Debug(2, '''"Message %s discarded: Subject: %s" % (msgnum, MUA_Subject)''')
        return None
    date = msg['Date']
    if not date:
        date = LastDate
    else:
        try:
            date = mktime_tz(parsedate_tz(date))
            if date >= Now:     # Ignore future
                date = LastDate
            else:
                LastDate = date
        except Exception:
            # Deliberately best-effort: the header is arbitrary mail data
            # (parsedate_tz() returns None for garbage, which makes
            # mktime_tz() raise), so fall back to the last good date.
            warn("File %s message %s parse error for {Date: %s}" % (name, msgnum, msg['Date']))
            date = LastDate
    Debug(2, '''"Found message %s: date=%s" % (msgnum, date)''')
    return date, msgnum, unixfrom + text + '\n\n'


def read_messages(fd):
    """Yield each message from the iterable of lines *fd* as one string.

    A line starting with 'From ' is held back, together with any
    continuation lines that start with whitespace, until the next line
    shows whether it is a message separator (it matches UnixFrom and
    some text has already been collected) or ordinary message text.
    """
    data = []
    app = data.append
    unixfrom = ''
    for line in fd:
        if unixfrom:
            if line[0].isspace() and line[0] != '\n':
                unixfrom += line
                continue
            if data and UnixFrom.match(unixfrom+line) is not None:
                yield ''.join(data)
                data[:] = [unixfrom]
            else:
                app(unixfrom)
            unixfrom = ''
        if line[:5] == 'From ':
            unixfrom = line
            continue
        app(line)
    if unixfrom:
        # Input ended on a held-back 'From ' line: keep it, don't lose it.
        app(unixfrom)
    if data:
        yield ''.join(data)


def sort_messages(files):
    """Return the text of every message in *files*, ordered by date.

    Also sets the global LastDate to the date of the newest message.
    """
    global LastDate
    messages = []
    for file in files:
        messages += process_file(file)
    if messages:
        messages.sort()     # tuples sort by date, then message number
        LastDate = messages[-1][0]
    Debug(1, '''"last date = %r" % LastDate''')
    return [text for date, number, text in messages]


def splitunixfrom(text):
    """Split *text* into (unixfrom, rest).

    unixfrom is the leading 'From ' envelope line with any continuation
    lines folded into one line, or '' when *text* does not match UnixFrom.
    """
    mo = UnixFrom.match(text)
    if mo is None:
        return '', text
    start, stop = mo.span()
    stop -= 1   # UnixFrom matches one char from following line
    unixfrom, text = text[start:stop], text[stop:]
    return ' '.join(ContLine.split(unixfrom)), text


def Debug(lvl, str):
    """Emit a debugging message through warn() when DebugLvl >= *lvl*.

    *str* is the source of a Python expression that is evaluated in the
    caller's frame, so that the message is only built when it is going
    to be shown.  If evaluation fails the expression text itself is shown.
    """
    if DebugLvl < lvl:
        return
    pad = ''
    #
    # Delayed evaluation of debug() argument allowed
    #
    cf = sys._getframe(1)   # the caller's frame
    try:
        pad = _frame_name(cf)
        if str:
            str = eval(str, cf.f_globals, cf.f_locals)
    except Exception:
        if DebugLvl > 9:
            import traceback
            traceback.print_exc()
    del cf  # no circ. refs!
    warn("%-*s %s" % (35+lvl, pad, str))


def _frame_name(frm, sep=os.sep):
    """Return 'file:function' (or 'file:Class.method') for frame *frm*."""
    code = frm.f_code
    filename = code.co_filename
    filename = filename[filename.rfind(sep)+1:]     # `basename'
    self = frm.f_locals.get('self')
    if self is None:
        return '%s:%s' % (filename, code.co_name)
    return '%s:%s.%s' % (filename, self.__class__.__name__, code.co_name)


def error(reason):
    """Write *reason* to standard error and exit with status 1."""
    sys.stderr.write('%s\n' % reason)
    sys.exit(1)


def report(message):
    """Write *message* and a newline to standard output, then flush."""
    sys.stdout.write('%s\n' % message)
    sys.stdout.flush()


def usage(reason=''):
    """Write the optional *reason*, the usage text and the module
    docstring to standard error, then exit with status 1."""
    sys.stdout.flush()
    if reason:
        sys.stderr.write('\t%s\n\n' % reason)
    head, tail = os.path.split(sys.argv[0])
    sys.stderr.write(Usage % tail)
    sys.stderr.write(__doc__ or '')     # __doc__ is None under -OO
    sys.exit(1)


def warn(msg):
    """Write *msg* to standard error unless --match filters it out."""
    if DebugMatch is not None and DebugMatch.search(msg) is None:
        return
    sys.stdout.flush()
    sys.stderr.write('%s\n' % msg)
    sys.stderr.flush()


def main():
    """Read all inputs, then write the date-ordered messages."""
    in_files, out_file = args()
    messages = sort_messages(in_files)
    if not messages:
        if Verbose:
            report('No messages found in %r.' % in_files)
        sys.exit(0)
    try:
        Debug(1, '''"Writing %s messages to %s" % (len(messages), out_file or '<stdout>')''')
        if not out_file:
            fd = _wrap_stdio(sys.stdout, 'w')
        else:
            fd = _open_mailbox(out_file, "w")
        try:
            fd.write('\n\n'.join(messages))
            fd.flush()
        finally:
            if fd is not sys.stdout:    # never close the real stdout
                fd.close()
        if out_file and LastDate:
            os.utime(out_file, (LastDate, LastDate))
#       if out_file and FileCache:
#           # Find newest a/mtime in FileCache and set out_file to have same
#           matimes = [(item['times'][1], item['times']) for item in FileCache.values()]
#           matimes.sort()
#           os.utime(out_file, matimes[-1][1])
    except (IOError, OSError) as val:   # os.utime() raises OSError
        error('Could not write "%s": %s' % (out_file or '<stdout>', str(val)))
    if out_file and ListLinks:
        print_links()


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        pass