Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion ingestors/email/msg.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re
import email
import logging
from email.header import decode_header, make_header
from email.policy import default
from email.errors import MessageError
from html import escape
Expand All @@ -13,6 +15,16 @@

log = logging.getLogger(__name__)

# Matches a single CR or LF; used to flatten folded header values onto one line.
linesep_splitter = re.compile(r"\n|\r")


def my_header_fetch_parse(name, value):
    """Decode and unfold a raw header value, then build a header object.

    Meant to be installed as the ``header_fetch_parse`` hook of an email
    policy, which calls it with the header name and the stored value.

    Args:
        name: The header field name (e.g. ``"Subject"``).
        value: Either an already-parsed header object (anything exposing a
            ``name`` attribute) or the raw header string, possibly folded
            over several lines and containing RFC 2047 encoded words.

    Returns:
        ``value`` unchanged if it is already a header object; otherwise the
        object produced by the default policy's ``header_factory`` for the
        decoded, single-line value.
    """
    if hasattr(value, "name"):
        # Already parsed: hand it back untouched.
        return value
    try:
        # Join and decode all encoded words up front, so words split across
        # folded lines are decoded as one string.
        decoded = str(make_header(decode_header(value)))
    except (MessageError, LookupError, UnicodeError):
        # Malformed encoded words (unknown charset, undecodable bytes, bad
        # base64) must not make the header unreadable: fall back to the raw
        # value and let the header factory parse it leniently.
        decoded = value
    # splitlines() is avoided on purpose: it splits on more than CR and LF.
    unfolded = "".join(linesep_splitter.split(decoded))
    return email.policy.default.header_factory(name, unfolded)


class RFC822Ingestor(Ingestor, EmailSupport, EncodingSupport):
MIME_TYPES = ["multipart/mixed", "message/rfc822"]
Expand Down Expand Up @@ -131,7 +143,8 @@ def ingest(self, file_path, entity):
entity.schema = model.get("Email")
try:
with open(file_path, "rb") as fh:
msg = email.message_from_binary_file(fh, policy=default)
policy = default.clone(header_fetch_parse=my_header_fetch_parse)
msg = email.message_from_binary_file(fh, policy=policy)
except (MessageError, ValueError, IndexError) as err:
raise ProcessingException("Cannot parse email: %s" % err) from err

Expand Down
27 changes: 27 additions & 0 deletions tests/fixtures/email_multiline_headers.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
From redacted@example.com Mon Sep 26 01:29:17 2016
Return-Path: <redacted@example.com>
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="=_cdba8b1f8013db8af57404a79c9f2707"
Date: Fri, 29 Apr 2016 11:17:12 +0300
From: =?UTF-8?Q?=D0=9E=D1=82=D0=B4=D0=B5=D0=BB_=D0=BF=D0=BE_=D1=80=D0=B0?=
=?UTF-8?Q?=D0=B1=D0=BE=D1=82=D0=B5_=D1=81_=D0=BF=D1=80=D0=BE=D1=85=D0=BE?=
=?UTF-8?Q?=D0=B6=D0=B4=D0=B5=D0=BD=D0=B8=D0=B5=D0=BC_=D0=B7=D0=B0=D0=BA?=
=?UTF-8?Q?=D0=BE=D0=BD=D0=BE=D0=BF=D1=80=D0=BE=D0=B5=D0=BA=D1=82=D0=BE?=
=?UTF-8?Q?=D0=B2?= <redacted@example.com>
To: =?UTF-8?Q?=D0=91=D0=BE=D1=80=D0=B8=D1=81=D0=BE=D0=B2?=
<redacted@example.org>
Subject: =?UTF-8?Q?=D0=94=D0=BE=D0=BF=D0=BE=D0=BB=D0=BD=D0=B8=D1=82=D0=B5?=
=?UTF-8?Q?=D0=BB=D1=8C=D0=BD=D0=BE=D0=B5_=D0=B7=D0=B0=D0=BA=D0=BB=D1=8E?=
=?UTF-8?Q?=D1=87=D0=B5=D0=BD=D0=B8=D0=B5_=D0=9F=D1=80=D0=BE=D1=84=D0=B8?=
=?UTF-8?Q?=D0=BB=D1=8C=D0=BD=D0=BE=D0=B3=D0=BE_=D0=9A=D0=BE=D0=BC=D0=B8?=
=?UTF-8?Q?=D1=82=D0=B5=D1=82=D0=B0_=D0=BD=D0=B0_=D0=BF=D1=80=D0=BE=D0=B5?=
=?UTF-8?Q?=D0=BA=D1=82_=D0=B7=D0=B0=D0=BA=D0=BE=D0=BD=D0=B0_=E2=84=96145-?=
=?UTF-8?Q?=D0=94?=
Message-ID: <ccdff393603faa4df447dc8903c14a57@example.com>
X-Sender: redacted@example.com
User-Agent: Roundcube Webmail/1.1.1

--=_cdba8b1f8013db8af57404a79c9f2707
--=_cdba8b1f8013db8af57404a79c9f2707--

17 changes: 17 additions & 0 deletions tests/test_msg.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,20 @@ def test_attached_inline_email(self):
"This is the body of a plaintext message.",
],
)

def test_headers(self):
    """Ingest the multiline-header fixture and check the decoded headers.

    The fixture's From and Subject headers are RFC 2047 encoded words
    folded across several lines; each must come out as one decoded string.
    """
    path, entity = self.fixture("email_multiline_headers.eml")
    self.manager.ingest(path, entity)
    self.assertSuccess(entity)

    # Checked in order: sender first, then subject.
    expectations = (
        (
            "from",
            "Отдел по работе с прохождением законопроектов <redacted@example.com>",
        ),
        (
            "subject",
            "Дополнительное заключение Профильного Комитета на проект закона №145-Д",
        ),
    )
    for prop, expected in expectations:
        self.assertEqual(entity.get(prop), [expected])