Mail charset error support

- Try multiple charsets when no charset is specified by the "Content-Type" field:
begin with the default encoding (sys.defaultencoding, utf8),
then "iso-8859-1", then the config file encoding, and finally the encoding found in
the previous "Subject" or "From" fields.

darcs-hash:20060201110843-86b55-7fd556a8ffadd9a5fed2b3317b17bfcca9d1dc58.gz
This commit is contained in:
David Rousselie
2006-02-01 12:08:43 +01:00
parent 4a5848a66c
commit a063a69773
7 changed files with 64 additions and 13 deletions

View File

@@ -37,6 +37,8 @@ POP3_TIMEOUT = 10
DO_NOTHING = 0 DO_NOTHING = 0
DIGEST = 1 DIGEST = 1
RETRIEVE = 2 RETRIEVE = 2
default_encoding = "iso-8859-1"
## All MY* classes are implemented to add a timeout (settimeout) ## All MY* classes are implemented to add a timeout (settimeout)
## while connecting ## while connecting
class MYIMAP4(imaplib.IMAP4): class MYIMAP4(imaplib.IMAP4):
@@ -192,41 +194,82 @@ class MailConnection(object):
+ str(self.online_action) + "#" + str(self.away_action) + "#" + \ + str(self.online_action) + "#" + str(self.away_action) + "#" + \
str(self.xa_action) + "#" + str(self.dnd_action) + "#" + str(self.offline_action) + "#" + str(self.interval) + "#" + str(self.live_email_only) str(self.xa_action) + "#" + str(self.dnd_action) + "#" + str(self.offline_action) + "#" + str(self.interval) + "#" + str(self.live_email_only)
def get_decoded_part(self, part): def get_decoded_part(self, part, charset_hint):
content_charset = part.get_content_charset() content_charset = part.get_content_charset()
if content_charset: if content_charset:
return part.get_payload(decode=True).decode(content_charset) return unicode(part.get_payload(decode=True).decode(content_charset))
else: else:
return part.get_payload(decode=True) result = ""
try:
result = unicode(part.get_payload(decode=True))
except Exception, e:
try:
result = unicode(part.get_payload(decode=True).decode("iso-8859-1"))
except Exception, e:
try:
result = unicode(part.get_payload(decode=True).decode(default_encoding))
except Exception, e:
if charset_hint is not None:
try:
result = unicode(part.get_payload(decode=True).decode(charset_hint))
except Exception, e:
print e
return result
def format_message(self, email_msg, include_body = True): def format_message(self, email_msg, include_body = True):
from_decoded = email.Header.decode_header(email_msg["From"]) from_decoded = email.Header.decode_header(email_msg["From"])
charset_hint = None
result = u"From : " result = u"From : "
for i in range(len(from_decoded)): for i in range(len(from_decoded)):
if from_decoded[i][1]: if from_decoded[i][1]:
charset_hint = from_decoded[i][1]
result += unicode(from_decoded[i][0].decode(from_decoded[i][1])) result += unicode(from_decoded[i][0].decode(from_decoded[i][1]))
else: else:
result += unicode(from_decoded[i][0]) try:
result += unicode(from_decoded[i][0])
except Exception,e:
try:
result += unicode(from_decoded[i][0].decode("iso-8859-1"))
except Exception, e:
try:
result += unicode(from_decoded[i][0].decode(default_encoding))
except Exception, e:
print e
result += "\n" result += "\n"
subject_decoded = email.Header.decode_header(email_msg["Subject"]) subject_decoded = email.Header.decode_header(email_msg["Subject"])
result += u"Subject : " result += u"Subject : "
for i in range(len(subject_decoded)): for i in range(len(subject_decoded)):
if subject_decoded[i][1]: if subject_decoded[i][1]:
charset_hint = subject_decoded[i][1]
result += unicode(subject_decoded[i][0].decode(subject_decoded[i][1])) result += unicode(subject_decoded[i][0].decode(subject_decoded[i][1]))
else: else:
result += unicode(subject_decoded[i][0]) try:
result += unicode(subject_decoded[i][0])
except Exception,e:
try:
result += unicode(subject_decoded[i][0].decode("iso-8859-1"))
except Exception, e:
try:
result += unicode(subject_decoded[i][0].decode(default_encoding))
except Exception, e:
if charset_hint is not None:
try:
result += unicode(subject_decoded[i][0].decode(charset_hint))
except Exception, e:
print e
result += u"\n\n" result += u"\n\n"
if include_body: if include_body:
action = { action = {
"text/plain" : lambda part: self.get_decoded_part(part), "text/plain" : lambda part: self.get_decoded_part(part, charset_hint),
"text/html" : lambda part: "\n<<<HTML part skipped>>>\n" "text/html" : lambda part: "\n<<<HTML part skipped>>>\n"
} }
for part in email_msg.walk(): for part in email_msg.walk():
content_type = part.get_content_type() content_type = part.get_content_type()
if action.has_key(content_type): if action.has_key(content_type):
result += unicode(action[content_type](part)) + u'\n' result += action[content_type](part) + u'\n'
return result return result
def format_message_summary(self, email_msg): def format_message_summary(self, email_msg):

2
jmc.py
View File

@@ -26,6 +26,7 @@ import sys
import os.path import os.path
import logging import logging
from jabber import mailconnection
from jabber.component import MailComponent, ComponentFatalError from jabber.component import MailComponent, ComponentFatalError
from jabber.config import Config from jabber.config import Config
@@ -46,6 +47,7 @@ def main(config_file = "jmc.xml", isDebug = 0):
str(sys.exc_value) str(sys.exc_value)
sys.exit(1) sys.exit(1)
mailconnection.default_encoding = config.get_content("config/mail_default_encoding")
print "creating component..." print "creating component..."
mailcomp = MailComponent(config) mailcomp = MailComponent(config)

View File

@@ -16,4 +16,5 @@
<spooldir>/var/spool/jabber</spooldir> <spooldir>/var/spool/jabber</spooldir>
<!-- default check interval in minutes --> <!-- default check interval in minutes -->
<check_interval>5</check_interval> <check_interval>5</check_interval>
<mail_default_encoding>iso-8859-1</mail_default_encoding>
</config> </config>

View File

@@ -70,9 +70,9 @@ if __name__ == '__main__':
storage_suite, \ storage_suite, \
dbmstorage_suite, \ dbmstorage_suite, \
sqlitestorage_suite)) sqlitestorage_suite))
# test_support.run_suite(mail_connection_suite) #test_support.run_suite(mail_connection_suite)
# test_support.run_suite(pop3_connection_suite) #test_support.run_suite(pop3_connection_suite)
# test_support.run_suite(imap_connection_suite) #test_support.run_suite(imap_connection_suite)
#test_support.run_suite(mc_factory_suite) #test_support.run_suite(mc_factory_suite)
#test_support.run_suite(component_suite) #test_support.run_suite(component_suite)
#test_support.run_suite(component2_suite) #test_support.run_suite(component2_suite)

View File

@@ -32,6 +32,9 @@ def _create_multipart(encoded):
part2 = MIMEText("Encoded multipart2 with 'iso-8859-15' charset (<28><><EFBFBD>)", \ part2 = MIMEText("Encoded multipart2 with 'iso-8859-15' charset (<28><><EFBFBD>)", \
_charset = "iso-8859-15") _charset = "iso-8859-15")
msg.attach(part2) msg.attach(part2)
part3 = MIMEText("Encoded multipart3 with no charset (<28><><EFBFBD>)", \
_charset = "")
msg.attach(part3)
else: else:
part1 = MIMEText("Not encoded multipart1") part1 = MIMEText("Not encoded multipart1")
msg.attach(part1) msg.attach(part1)

View File

@@ -15,4 +15,5 @@
<storage>SQLite</storage> <storage>SQLite</storage>
<spooldir>.</spooldir> <spooldir>.</spooldir>
<check_interval>5</check_interval> <!-- in minutes --> <check_interval>5</check_interval> <!-- in minutes -->
<mail_default_encoding>iso-8859-15</mail_default_encoding>
</config> </config>

View File

@@ -48,12 +48,12 @@ class MailConnection_TestCase(unittest.TestCase):
test_get_decoded_part_not_encoded = \ test_get_decoded_part_not_encoded = \
make_test((False, False, False), \ make_test((False, False, False), \
lambda self, email: self.connection.get_decoded_part(email), \ lambda self, email: self.connection.get_decoded_part(email, None), \
u"Not encoded single part") u"Not encoded single part")
test_get_decoded_part_encoded = \ test_get_decoded_part_encoded = \
make_test((True, False, False), \ make_test((True, False, False), \
lambda self, email: self.connection.get_decoded_part(email), \ lambda self, email: self.connection.get_decoded_part(email, None), \
u"Encoded single part with 'iso-8859-15' charset (éàê)") u"Encoded single part with 'iso-8859-15' charset (éàê)")
test_format_message_summary_not_encoded = \ test_format_message_summary_not_encoded = \
@@ -104,7 +104,8 @@ class MailConnection_TestCase(unittest.TestCase):
lambda self, email: self.connection.format_message(email), \ lambda self, email: self.connection.format_message(email), \
u"From : encoded from (éàê)\nSubject : encoded subject (éà" + \ u"From : encoded from (éàê)\nSubject : encoded subject (éà" + \
u"ê)\n\nutf-8 multipart1 with no charset (éàê)" + \ u"ê)\n\nutf-8 multipart1 with no charset (éàê)" + \
u"\nEncoded multipart2 with 'iso-8859-15' charset (éàê)\n") u"\nEncoded multipart2 with 'iso-8859-15' charset (éàê)\n" + \
u"Encoded multipart3 with no charset (éàê)\n")
class POP3Connection_TestCase(unittest.TestCase): class POP3Connection_TestCase(unittest.TestCase):