Mail charset error support

- Try multiple charsets when no charset is specified by the "Content-Type" field:
begin with the system default encoding (sys.defaultencoding, utf8),
then "iso-8859-1", then the config file encoding, and finally the encoding found
in the previous "Subject" or "From" fields.

darcs-hash:20060201110843-86b55-7fd556a8ffadd9a5fed2b3317b17bfcca9d1dc58.gz
This commit is contained in:
David Rousselie
2006-02-01 12:08:43 +01:00
parent 4a5848a66c
commit a063a69773
7 changed files with 64 additions and 13 deletions

View File

@@ -37,6 +37,8 @@ POP3_TIMEOUT = 10
DO_NOTHING = 0
DIGEST = 1
RETRIEVE = 2
default_encoding = "iso-8859-1"
## All MY* classes are implemented to add a timeout (settimeout)
## while connecting
class MYIMAP4(imaplib.IMAP4):
@@ -192,41 +194,82 @@ class MailConnection(object):
+ str(self.online_action) + "#" + str(self.away_action) + "#" + \
str(self.xa_action) + "#" + str(self.dnd_action) + "#" + str(self.offline_action) + "#" + str(self.interval) + "#" + str(self.live_email_only)
def get_decoded_part(self, part):
def get_decoded_part(self, part, charset_hint):
content_charset = part.get_content_charset()
if content_charset:
return part.get_payload(decode=True).decode(content_charset)
return unicode(part.get_payload(decode=True).decode(content_charset))
else:
return part.get_payload(decode=True)
result = ""
try:
result = unicode(part.get_payload(decode=True))
except Exception, e:
try:
result = unicode(part.get_payload(decode=True).decode("iso-8859-1"))
except Exception, e:
try:
result = unicode(part.get_payload(decode=True).decode(default_encoding))
except Exception, e:
if charset_hint is not None:
try:
result = unicode(part.get_payload(decode=True).decode(charset_hint))
except Exception, e:
print e
return result
def format_message(self, email_msg, include_body = True):
from_decoded = email.Header.decode_header(email_msg["From"])
charset_hint = None
result = u"From : "
for i in range(len(from_decoded)):
if from_decoded[i][1]:
charset_hint = from_decoded[i][1]
result += unicode(from_decoded[i][0].decode(from_decoded[i][1]))
else:
result += unicode(from_decoded[i][0])
try:
result += unicode(from_decoded[i][0])
except Exception,e:
try:
result += unicode(from_decoded[i][0].decode("iso-8859-1"))
except Exception, e:
try:
result += unicode(from_decoded[i][0].decode(default_encoding))
except Exception, e:
print e
result += "\n"
subject_decoded = email.Header.decode_header(email_msg["Subject"])
result += u"Subject : "
for i in range(len(subject_decoded)):
if subject_decoded[i][1]:
charset_hint = subject_decoded[i][1]
result += unicode(subject_decoded[i][0].decode(subject_decoded[i][1]))
else:
result += unicode(subject_decoded[i][0])
try:
result += unicode(subject_decoded[i][0])
except Exception,e:
try:
result += unicode(subject_decoded[i][0].decode("iso-8859-1"))
except Exception, e:
try:
result += unicode(subject_decoded[i][0].decode(default_encoding))
except Exception, e:
if charset_hint is not None:
try:
result += unicode(subject_decoded[i][0].decode(charset_hint))
except Exception, e:
print e
result += u"\n\n"
if include_body:
action = {
"text/plain" : lambda part: self.get_decoded_part(part),
"text/plain" : lambda part: self.get_decoded_part(part, charset_hint),
"text/html" : lambda part: "\n<<<HTML part skipped>>>\n"
}
for part in email_msg.walk():
content_type = part.get_content_type()
if action.has_key(content_type):
result += unicode(action[content_type](part)) + u'\n'
result += action[content_type](part) + u'\n'
return result
def format_message_summary(self, email_msg):

2
jmc.py
View File

@@ -26,6 +26,7 @@ import sys
import os.path
import logging
from jabber import mailconnection
from jabber.component import MailComponent, ComponentFatalError
from jabber.config import Config
@@ -46,6 +47,7 @@ def main(config_file = "jmc.xml", isDebug = 0):
str(sys.exc_value)
sys.exit(1)
mailconnection.default_encoding = config.get_content("config/mail_default_encoding")
print "creating component..."
mailcomp = MailComponent(config)

View File

@@ -16,4 +16,5 @@
<spooldir>/var/spool/jabber</spooldir>
<!-- default check interval in minutes -->
<check_interval>5</check_interval>
<mail_default_encoding>iso-8859-1</mail_default_encoding>
</config>

View File

@@ -70,9 +70,9 @@ if __name__ == '__main__':
storage_suite, \
dbmstorage_suite, \
sqlitestorage_suite))
# test_support.run_suite(mail_connection_suite)
# test_support.run_suite(pop3_connection_suite)
# test_support.run_suite(imap_connection_suite)
#test_support.run_suite(mail_connection_suite)
#test_support.run_suite(pop3_connection_suite)
#test_support.run_suite(imap_connection_suite)
#test_support.run_suite(mc_factory_suite)
#test_support.run_suite(component_suite)
#test_support.run_suite(component2_suite)

View File

@@ -32,6 +32,9 @@ def _create_multipart(encoded):
part2 = MIMEText("Encoded multipart2 with 'iso-8859-15' charset (<28><><EFBFBD>)", \
_charset = "iso-8859-15")
msg.attach(part2)
part3 = MIMEText("Encoded multipart3 with no charset (<28><><EFBFBD>)", \
_charset = "")
msg.attach(part3)
else:
part1 = MIMEText("Not encoded multipart1")
msg.attach(part1)

View File

@@ -15,4 +15,5 @@
<storage>SQLite</storage>
<spooldir>.</spooldir>
<check_interval>5</check_interval> <!-- in minutes -->
<mail_default_encoding>iso-8859-15</mail_default_encoding>
</config>

View File

@@ -48,12 +48,12 @@ class MailConnection_TestCase(unittest.TestCase):
test_get_decoded_part_not_encoded = \
make_test((False, False, False), \
lambda self, email: self.connection.get_decoded_part(email), \
lambda self, email: self.connection.get_decoded_part(email, None), \
u"Not encoded single part")
test_get_decoded_part_encoded = \
make_test((True, False, False), \
lambda self, email: self.connection.get_decoded_part(email), \
lambda self, email: self.connection.get_decoded_part(email, None), \
u"Encoded single part with 'iso-8859-15' charset (éàê)")
test_format_message_summary_not_encoded = \
@@ -104,7 +104,8 @@ class MailConnection_TestCase(unittest.TestCase):
lambda self, email: self.connection.format_message(email), \
u"From : encoded from (éàê)\nSubject : encoded subject (éà" + \
u"ê)\n\nutf-8 multipart1 with no charset (éàê)" + \
u"\nEncoded multipart2 with 'iso-8859-15' charset (éàê)\n")
u"\nEncoded multipart2 with 'iso-8859-15' charset (éàê)\n" + \
u"Encoded multipart3 with no charset (éàê)\n")
class POP3Connection_TestCase(unittest.TestCase):