"""Correct bogofilter's training using mail that has already been sorted.

Messages found in the inbox maildir are taken to be "not spam" and messages
found in the spam maildir are taken to be "spam".  Every message carries the
verdict bogofilter gave it in its ``X-Bogosity`` header; whenever that verdict
disagrees with the folder the message is in, the message is re-registered
with bogofilter.  Only messages changed since the previous run (recorded as
the modification time of a marker file) are examined.

All relative paths below are resolved against the current working directory.
"""

import os
import os.path
import email
import email.parser
import sys
import time

# Name of the file whose modification date records when
# this program was last run
last_run_marker_file = "spam/last_training_run"

# Path to bogofilter
bogofilter_path = "/usr/local/bin/bogofilter"

# Inbox maildir
inbox_maildir = "Maildir"

# Spam maildir
spam_maildir = "Maildir/.Spam"

# Some global counts.

# Number of mails processed
processed = 0

# Number of spams (those in spam maildir)
count_spam = 0

# Number of not spams (those in inbox maildir)
count_not_spam = 0

# Number of spams that bogofilter thought weren't spam
# and so had to be registered as spam (false negatives)
registered_as_spam = 0

# Number of non spams that bogofilter thought were spams
# and so had to be registered as normal (false positives)
registered_as_not_spam = 0

# Holds the log file (opened by init(), closed by term())
log_file = None

# Parser class used to read message headers.  Where BytesParser exists
# (Python 3) mail is parsed from bytes so that a message in an arbitrary or
# broken encoding cannot raise a decode error; otherwise (Python 2) the plain
# Parser already operates on byte strings.
_HeaderParser = getattr(email.parser, "BytesParser", email.parser.Parser)


def log_message(text, msg_headers):
    """Append one line to the log describing a message.

    text        -- description of what was done with the message
    msg_headers -- parsed message; its Subject and From headers are logged

    The log file must already have been opened by init().
    """
    log_file.write("%s Subject \"%s\" From \"%s\"\n"
                   % (text, msg_headers.get("Subject"),
                      msg_headers.get("From")))


def count_as_spam(mail_file, msg_headers):
    """Count a message as spam.

    Both arguments are unused; they are accepted so the function has the
    same signature as the other per-message callbacks.
    """
    global count_spam
    count_spam += 1


def count_as_not_spam(mail_file, msg_headers):
    """Count a message as not spam.

    Both arguments are unused; they are accepted so the function has the
    same signature as the other per-message callbacks.
    """
    global count_not_spam
    count_not_spam += 1


def _run_bogofilter(flag, mail_file):
    """Run ``bogofilter <flag> -I <mail_file>`` and wait for it to finish.

    On a non-zero result an error is printed and the program exits with
    status 1.
    """
    rc = os.spawnl(os.P_WAIT, bogofilter_path,
                   "bogofilter", flag, "-I", mail_file)
    if rc != 0:
        sys.stdout.write("Error processing file %s - %d\n" % (mail_file, rc))
        sys.exit(1)


def register_as_not_spam(mail_file, msg_headers):
    """Register the contents of mail_file with bogofilter as not spam.

    Exits the program (status 1) if bogofilter fails; otherwise bumps the
    false-positive counter and logs the message.
    """
    global registered_as_not_spam
    _run_bogofilter("-n", mail_file)
    registered_as_not_spam += 1
    log_message("Message In Inbox. bogofilter thought spam", msg_headers)


def register_as_spam(mail_file, msg_headers):
    """Register the contents of mail_file with bogofilter as spam.

    Exits the program (status 1) if bogofilter fails; otherwise bumps the
    false-negative counter and logs the message.
    """
    global registered_as_spam
    _run_bogofilter("-s", mail_file)
    registered_as_spam += 1
    log_message("Message In Spam. bogofilter thought not spam", msg_headers)


def process_maildir(last_run_time, maildir, spam_fcns, not_spam_fcns):
    """Process the mail in maildir that has been read (its ``cur`` folder).

    last_run_time -- seconds since the epoch; files whose change time
                     (os.path.getctime) is not later than this are skipped
    maildir       -- path of the maildir to scan
    spam_fcns     -- callables invoked as fcn(mail_file, msg_headers) for
                     each message bogofilter marked as spam
    not_spam_fcns -- the same, for each message bogofilter did not mark as
                     spam

    Messages without an ``X-Bogosity`` header, without a ``:2,`` info field
    in their file name, or flagged ``T`` (trashed) are ignored.  The working
    directory is changed to ``<maildir>/cur`` while scanning, so callbacks
    receive bare file names, and is always restored afterwards -- even when
    a callback raises or exits.
    """
    global processed

    # Remember where we are, then move into the folder being scanned
    original_dir = os.getcwd()
    os.chdir("%s/cur" % maildir)
    try:
        parser = _HeaderParser()
        for mail_file in os.listdir("."):
            # Drop dot files and directories
            if mail_file.startswith(".") or not os.path.isfile(mail_file):
                continue

            # Only process those files changed after the last run
            if os.path.getctime(mail_file) <= last_run_time:
                continue

            # Don't look at those marked with a T (for trashed) in the info
            # field.  When the mail folder is "compacted" these messages
            # will go away.
            info_index = mail_file.rfind(":2,")
            if info_index == -1 or "T" in mail_file[info_index + 3:]:
                continue

            # We only need the headers so only parse them; the file is
            # closed again as soon as they have been read.
            with open(mail_file, "rb") as mail:
                msg_headers = parser.parse(mail, True)

            # Only process those marked by bogofilter
            bogosity = msg_headers.get("X-Bogosity")
            if bogosity is None:
                continue

            processed += 1

            # The verdict is the text before the first comma.  str() guards
            # against the header coming back as a Header object rather than
            # a plain string.
            is_spam = str(bogosity).split(",")[0]
            fcns = spam_fcns if is_spam == "Yes" else not_spam_fcns
            for fcn in fcns:
                fcn(mail_file, msg_headers)
    finally:
        # Back to the original dir
        os.chdir(original_dir)


def process_inbox(last_run_time):
    """Process the inbox: anything bogofilter called spam is a false positive."""
    process_maildir(last_run_time, inbox_maildir,
                    [register_as_not_spam, count_as_not_spam],
                    [count_as_not_spam])


def process_spam(last_run_time):
    """Process the spam folder: anything bogofilter passed is a false negative."""
    process_maildir(last_run_time, spam_maildir,
                    [count_as_spam],
                    [register_as_spam, count_as_spam])


def get_last_run_time():
    """Return the time (seconds since the epoch) the program was last run.

    This is the modification time of the marker file, or 0 when the marker
    file does not exist.
    """
    if os.path.exists(last_run_marker_file):
        return os.path.getmtime(last_run_marker_file)
    return 0


def update_last_run_time():
    """Update the record of when the program was last run.

    Creates the marker file if necessary, then sets its access and
    modification times to the current time.
    """
    if not os.path.exists(last_run_marker_file):
        with open(last_run_marker_file, "w") as timing_file:
            timing_file.write("# Marker file. Ignore contents")
    os.utime(last_run_marker_file, None)


def init():
    """Initialize the program and return the time it was last run.

    Opens the log file for appending (stored in the global log_file) and
    writes an initial message to both standard output and the log.
    """
    global log_file
    last_run_time = get_last_run_time()
    log_file = open("spam/log.txt", "a")
    s = "Starting run. Last run %s\n" % time.ctime(last_run_time)
    sys.stdout.write(s)
    log_file.write(s)
    return last_run_time


def term():
    """End the program.

    Updates the time the program was last run, writes a summary message to
    both standard output and the log, and closes the log.
    """
    update_last_run_time()
    s = ("Processed %d messages. %d spam and %d not spam. "
         "Registered %d as spam and %d as not spam.\n"
         % (processed, count_spam, count_not_spam,
            registered_as_spam, registered_as_not_spam))
    sys.stdout.write(s)
    log_file.write(s)
    log_file.close()


def main():
    """The main function: process the spam folder, then the inbox."""
    last_run_time = init()
    process_spam(last_run_time)
    process_inbox(last_run_time)
    term()


if __name__ == "__main__":
    # Need to set library path for my install of BerkeleyDB used by
    # bogofilter
    os.environ['LD_LIBRARY_PATH'] = "/usr/local/BerkeleyDB.4.0/lib"
    main()