From a063a69773e087b140a8c4886537afe19ea2150f Mon Sep 17 00:00:00 2001 From: David Rousselie Date: Wed, 1 Feb 2006 12:08:43 +0100 Subject: [PATCH] Mail charset error support - Try multiple charsets when no charset is specified by the "Content-Type" field: begin with sys.defaultencoding (utf8), then "iso-8859-1", then the config file encoding, and finally the encoding found in the previous "Subject" or "From" fields. darcs-hash:20060201110843-86b55-7fd556a8ffadd9a5fed2b3317b17bfcca9d1dc58.gz --- jabber/mailconnection.py | 57 +++++++++++++++++++++++++++++++----- jmc.py | 2 ++ jmc.xml | 1 + run_test.py | 6 ++-- tests/email_generator.py | 3 ++ tests/jmc-test.xml | 1 + tests/test_mailconnection.py | 7 +++-- 7 files changed, 64 insertions(+), 13 deletions(-) diff --git a/jabber/mailconnection.py b/jabber/mailconnection.py index bc71ba9..c93bc0b 100644 --- a/jabber/mailconnection.py +++ b/jabber/mailconnection.py @@ -37,6 +37,8 @@ POP3_TIMEOUT = 10 DO_NOTHING = 0 DIGEST = 1 RETRIEVE = 2 +default_encoding = "iso-8859-1" + ## All MY* classes are implemented to add a timeout (settimeout) ## while connecting class MYIMAP4(imaplib.IMAP4): @@ -192,41 +194,82 @@ class MailConnection(object): + str(self.online_action) + "#" + str(self.away_action) + "#" + \ str(self.xa_action) + "#" + str(self.dnd_action) + "#" + str(self.offline_action) + "#" + str(self.interval) + "#" + str(self.live_email_only) - def get_decoded_part(self, part): + def get_decoded_part(self, part, charset_hint): content_charset = part.get_content_charset() if content_charset: - return part.get_payload(decode=True).decode(content_charset) + return unicode(part.get_payload(decode=True).decode(content_charset)) else: - return part.get_payload(decode=True) + result = "" + try: + result = unicode(part.get_payload(decode=True)) + except Exception, e: + try: + result = unicode(part.get_payload(decode=True).decode("iso-8859-1")) + except Exception, e: + try: + result = unicode(part.get_payload(decode=True).decode(default_encoding)) + except 
Exception, e: + if charset_hint is not None: + try: + result = unicode(part.get_payload(decode=True).decode(charset_hint)) + except Exception, e: + print e + return result def format_message(self, email_msg, include_body = True): from_decoded = email.Header.decode_header(email_msg["From"]) + charset_hint = None result = u"From : " for i in range(len(from_decoded)): if from_decoded[i][1]: + charset_hint = from_decoded[i][1] result += unicode(from_decoded[i][0].decode(from_decoded[i][1])) else: - result += unicode(from_decoded[i][0]) + try: + result += unicode(from_decoded[i][0]) + except Exception,e: + try: + result += unicode(from_decoded[i][0].decode("iso-8859-1")) + except Exception, e: + try: + result += unicode(from_decoded[i][0].decode(default_encoding)) + except Exception, e: + print e result += "\n" subject_decoded = email.Header.decode_header(email_msg["Subject"]) result += u"Subject : " for i in range(len(subject_decoded)): if subject_decoded[i][1]: + charset_hint = subject_decoded[i][1] result += unicode(subject_decoded[i][0].decode(subject_decoded[i][1])) else: - result += unicode(subject_decoded[i][0]) + try: + result += unicode(subject_decoded[i][0]) + except Exception,e: + try: + result += unicode(subject_decoded[i][0].decode("iso-8859-1")) + except Exception, e: + try: + result += unicode(subject_decoded[i][0].decode(default_encoding)) + except Exception, e: + if charset_hint is not None: + try: + result += unicode(subject_decoded[i][0].decode(charset_hint)) + except Exception, e: + print e + result += u"\n\n" if include_body: action = { - "text/plain" : lambda part: self.get_decoded_part(part), + "text/plain" : lambda part: self.get_decoded_part(part, charset_hint), "text/html" : lambda part: "\n<<>>\n" } for part in email_msg.walk(): content_type = part.get_content_type() if action.has_key(content_type): - result += unicode(action[content_type](part)) + u'\n' + result += action[content_type](part) + u'\n' return result def 
format_message_summary(self, email_msg): diff --git a/jmc.py b/jmc.py index ea9bd6f..3346daf 100755 --- a/jmc.py +++ b/jmc.py @@ -26,6 +26,7 @@ import sys import os.path import logging +from jabber import mailconnection from jabber.component import MailComponent, ComponentFatalError from jabber.config import Config @@ -46,6 +47,7 @@ def main(config_file = "jmc.xml", isDebug = 0): str(sys.exc_value) sys.exit(1) + mailconnection.default_encoding = config.get_content("config/mail_default_encoding") print "creating component..." mailcomp = MailComponent(config) diff --git a/jmc.xml b/jmc.xml index ed3c278..7a3f58d 100644 --- a/jmc.xml +++ b/jmc.xml @@ -16,4 +16,5 @@ /var/spool/jabber 5 + iso-8859-1 diff --git a/run_test.py b/run_test.py index 19e1486..163f26f 100644 --- a/run_test.py +++ b/run_test.py @@ -70,9 +70,9 @@ if __name__ == '__main__': storage_suite, \ dbmstorage_suite, \ sqlitestorage_suite)) - # test_support.run_suite(mail_connection_suite) - # test_support.run_suite(pop3_connection_suite) - # test_support.run_suite(imap_connection_suite) + #test_support.run_suite(mail_connection_suite) + #test_support.run_suite(pop3_connection_suite) + #test_support.run_suite(imap_connection_suite) #test_support.run_suite(mc_factory_suite) #test_support.run_suite(component_suite) #test_support.run_suite(component2_suite) diff --git a/tests/email_generator.py b/tests/email_generator.py index 3312bb9..0af84dc 100644 --- a/tests/email_generator.py +++ b/tests/email_generator.py @@ -32,6 +32,9 @@ def _create_multipart(encoded): part2 = MIMEText("Encoded multipart2 with 'iso-8859-15' charset (éàê)", \ _charset = "iso-8859-15") msg.attach(part2) + part3 = MIMEText("Encoded multipart3 with no charset (éàê)", \ + _charset = "") + msg.attach(part3) else: part1 = MIMEText("Not encoded multipart1") msg.attach(part1) diff --git a/tests/jmc-test.xml b/tests/jmc-test.xml index f09380f..875c883 100644 --- a/tests/jmc-test.xml +++ b/tests/jmc-test.xml @@ -15,4 +15,5 @@ SQLite . 
5 + iso-8859-15 diff --git a/tests/test_mailconnection.py b/tests/test_mailconnection.py index 49b0b8a..63d0e44 100644 --- a/tests/test_mailconnection.py +++ b/tests/test_mailconnection.py @@ -48,12 +48,12 @@ class MailConnection_TestCase(unittest.TestCase): test_get_decoded_part_not_encoded = \ make_test((False, False, False), \ - lambda self, email: self.connection.get_decoded_part(email), \ + lambda self, email: self.connection.get_decoded_part(email, None), \ u"Not encoded single part") test_get_decoded_part_encoded = \ make_test((True, False, False), \ - lambda self, email: self.connection.get_decoded_part(email), \ + lambda self, email: self.connection.get_decoded_part(email, None), \ u"Encoded single part with 'iso-8859-15' charset (éàê)") test_format_message_summary_not_encoded = \ @@ -104,7 +104,8 @@ class MailConnection_TestCase(unittest.TestCase): lambda self, email: self.connection.format_message(email), \ u"From : encoded from (éàê)\nSubject : encoded subject (éà" + \ u"ê)\n\nutf-8 multipart1 with no charset (éàê)" + \ - u"\nEncoded multipart2 with 'iso-8859-15' charset (éàê)\n") + u"\nEncoded multipart2 with 'iso-8859-15' charset (éàê)\n" + \ + u"Encoded multipart3 with no charset (éàê)\n") class POP3Connection_TestCase(unittest.TestCase):