def _get_decoded_body(message: Union[Message, EmailMessage]) -> bytes:
    """Return the decoded payload of ``message`` without trailing newlines.

    Returns ``b""`` when the payload cannot be decoded as a single body
    (``get_payload(decode=True)`` returns ``None`` for multipart containers).
    """
    payload = message.get_payload(decode=True)
    if payload is None:
        return b""
    return payload.rstrip(b"\n")


def _get_message_text(
    message: Union[Message, EmailMessage]
) -> Tuple[bool, List[bytes]]:
    """Recursively parse a message and return ``(is_plain_text, parts)``.

    ``parts`` is the list of non-empty body chunks selected from ``message``
    (attachments and non-text parts are ignored); ``is_plain_text`` is False
    as soon as one of the selected chunks is not ``text/plain``.

    For ``multipart/alternative``, only the largest plain text alternative is
    kept, or the largest other alternative if there is no plain text one. For
    every other multipart type, all the selected chunks are returned in order.
    """
    # Ignore all attachments; only consider message body. Compare the
    # disposition token itself (lowercased, without parameters) so that an
    # inline part whose *filename* contains "attachment" is not dropped.
    if message.get_content_disposition() == "attachment":
        return (False, [])

    maintype = message.get_content_maintype()
    subtype = message.get_content_subtype()

    if maintype == "text":
        # This is a simple message (message part)
        body = _get_decoded_body(message)
        if body and subtype == "plain":
            return (True, [body])
        if body and subtype == "html":
            return (False, [body])
        return (True, [])

    if maintype != "multipart":
        # Neither text nor a container of parts (e.g. an inline image)
        return (False, [])

    if not message.is_multipart():
        # The message claims to be multipart but was not parsed into
        # sub-parts (e.g. no boundary was declared): its payload is a single
        # raw body, which we return as a fallback instead of iterating on it.
        body = _get_decoded_body(message)
        return (False, [body] if body else [])

    # This message (message part) contains sub-parts.
    text_parts: List[bytes] = []
    fallback_parts: List[bytes] = []
    all_parts: List[bytes] = []

    # Parse each part independently:
    for part in message.get_payload():
        (part_is_plain_text, part_chunks) = _get_message_text(part)
        part_body = b"".join(part_chunks)
        if not part_body:
            # Empty or ignored parts must not take part in the selection:
            # an empty text/plain alternative would otherwise shadow a
            # non-empty HTML one, and a skipped attachment would flag the
            # whole message as not being plain text.
            continue
        if part_is_plain_text:
            text_parts.append(part_body)
        else:
            fallback_parts.append(part_body)
        all_parts.extend(part_chunks)

    if subtype == "alternative":
        # Return the largest plain text part if any; or the largest HTML otherwise
        if text_parts:
            return (True, [max(text_parts, key=len)])
        if fallback_parts:
            return (False, [max(fallback_parts, key=len)])
        return (False, [])

    # Handles multipart/mixed; but this should be an appropriate handling for
    # other multipart formats
    return (not fallback_parts, all_parts)


def get_message_plaintext(message: EmailMessage) -> Optional[bytes]:
    """Get the plaintext body for a given message, if any such part exists.
    If only a html part exists, return that instead.

    Attachments are ignored. If the message (or one of its nested parts) is a
    ``multipart/alternative``, the largest plain text alternative is used, or
    the largest other alternative when there is no plain text one. For other
    multipart messages, the selected parts are concatenated in order.

    Returns:
      the selected body as bytes, or None if the message has no usable body.
    """
    (_, parts) = _get_message_text(message)
    return b"".join(parts) or None
+--------------boundary1 +Content-Type: multipart/mixed; boundary="------------boundary2" + +This is a multi-part message in MIME format. + +------------boundary2 +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +This is plain text + +--------------boundary2 +Content-Type: text/html + +and this is HTML + +--------------boundary1 +Content-Type: multipart/mixed; boundary="------------boundary3" + +This is a multi-part message in MIME format. + +--------------boundary3 +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +This is plain text + +--------------boundary3 +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +and more plain text diff --git a/swh/web/tests/inbound_email/resources/multipart_related.eml b/swh/web/tests/inbound_email/resources/multipart_related.eml new file mode 100644 --- /dev/null +++ b/swh/web/tests/inbound_email/resources/multipart_related.eml @@ -0,0 +1,42 @@ +MIME-Version: 1.0 +From: Forwarder +Date: Fri, 18 Feb 2022 09:40:39 +0100 +Content-Type: multipart/related; boundary="00000000000015426c05d846d79d" +Subject: Fwd: Look at this + +--00000000000015426c05d846d79d +Content-Type: multipart/alternative; boundary="00000000000015426b05d846d79c" + +--00000000000015426b05d846d79c +Content-Type: text/plain; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +See the message below + +---------- Forwarded message --------- +From: Test User +Date: Thu, 17 Feb 2022 at 17:00 +Subject: Look at this + + +Hello everyone, + +See my attachment + + +--00000000000015426b05d846d79c +Content-Type: text/html; charset="UTF-8" +Content-Transfer-Encoding: quoted-printable + +Hello everyone, + +--00000000000015426b05d846d79c-- +--00000000000015426c05d846d79d +Content-Type: image/png; name="attachment.png" +Content-Disposition: inline; filename="attachment.png" +Content-ID: +X-Attachment-Id: my-attachment-id + +This is an image 
+--00000000000015426c05d846d79d-- diff --git a/swh/web/tests/inbound_email/test_utils.py b/swh/web/tests/inbound_email/test_utils.py --- a/swh/web/tests/inbound_email/test_utils.py +++ b/swh/web/tests/inbound_email/test_utils.py @@ -298,6 +298,21 @@ ], id="multipart_mixed_text_only", ), + pytest.param( + "multipart_alternative_recursive.eml", + [b"This is plain text", b"and more plain text"], + [b"this is HTML", b"This is a multi-part message in MIME format."], + id="multipart_alternative_recursive", + ), + pytest.param( + "multipart_related.eml", + [ + b"See the message below\n\n---------- Forwarded message ---------", + b"Hello everyone,\n\nSee my attachment", + ], + [b"this is HTML", b"This is a multi-part message in MIME format."], + id="multipart_alternative_recursive", + ), ), ) def test_get_message_plaintext(