def _strip_payload(payload: Optional[bytes]) -> bytes:
    """Normalize a decoded payload: map ``None`` to ``b""`` and drop trailing newlines.

    ``get_payload(decode=True)`` returns ``None`` when a part carries no payload
    (and for multipart containers); treating that as an empty body lets callers
    handle it like any other blank part instead of crashing on ``None.rstrip``.
    """
    if payload is None:
        return b""
    return payload.rstrip(b"\n")


def get_message_plaintext(message: EmailMessage) -> Optional[bytes]:
    """Get the plaintext body for a given message, if any such part exists. If only an
    HTML part exists, return that instead.

    If there are multiple, ambiguous plain text or HTML parts in the message, this
    function will return the largest of them.

    Parts whose ``Content-Disposition`` is ``attachment`` are never considered.
    Trailing newlines are stripped from the returned body.

    Args:
        message: the parsed email message to extract the body from.

    Returns:
        The decoded body as bytes, or ``None`` if the message has no non-empty
        plain text (or, failing that, HTML) body.

    """
    if not message.is_multipart():
        return _strip_payload(message.get_payload(decode=True)) or None

    text_parts: List[bytes] = []
    fallback_parts: List[bytes] = []

    for part in message.walk():
        # Compare the parsed, lower-cased disposition value rather than searching
        # the raw header text: a substring test would also skip inline parts whose
        # filename merely contains "attachment", and would miss "ATTACHMENT".
        if part.get_content_disposition() == "attachment":
            continue

        content_type = part.get_content_type()
        if content_type == "text/plain":
            body = _strip_payload(part.get_payload(decode=True))
            if body:
                text_parts.append(body)
        elif not text_parts and content_type == "text/html":
            # HTML is only a fallback; stop collecting it once plain text was seen.
            body = _strip_payload(part.get_payload(decode=True))
            if body:
                fallback_parts.append(body)

    # Plain text always wins over HTML; within a kind, the largest part wins.
    for candidates in (text_parts, fallback_parts):
        if candidates:
            return max(candidates, key=len)

    return None
Multipart email.

-- 
Test User
+--67575b1b68b24603a2d00f02e032c975-- diff --git a/swh/web/tests/inbound_email/resources/multipart_html_only.eml b/swh/web/tests/inbound_email/resources/multipart_html_only.eml new file mode 100644 --- /dev/null +++ b/swh/web/tests/inbound_email/resources/multipart_html_only.eml @@ -0,0 +1,21 @@ +Return-Path: +X-Mailer: MessagingEngine.com Webmail Interface +User-Agent: Cyrus-JMAP/3.7.0-alpha0-382-g88b93171a9-fm-20220330.001-g88b93171 +Mime-Version: 1.0 +Message-Id: +Date: Mon, 04 Apr 2022 17:10:00 +0200 +From: "Test User" +To: test@example.com +Subject: Multipart email +Content-Type: multipart/alternative; + boundary=67575b1b68b24603a2d00f02e032c975 + +--67575b1b68b24603a2d00f02e032c975 +Content-Type: text/html + +
Multipart email (short html part).

-- 
Test User
+--67575b1b68b24603a2d00f02e032c975 +Content-Type: text/html + +
Multipart email (a much longer html part).

-- 
Test User
+--67575b1b68b24603a2d00f02e032c975-- diff --git a/swh/web/tests/inbound_email/resources/multipart_text_only.eml b/swh/web/tests/inbound_email/resources/multipart_text_only.eml new file mode 100644 --- /dev/null +++ b/swh/web/tests/inbound_email/resources/multipart_text_only.eml @@ -0,0 +1,27 @@ +Return-Path: +X-Mailer: MessagingEngine.com Webmail Interface +User-Agent: Cyrus-JMAP/3.7.0-alpha0-382-g88b93171a9-fm-20220330.001-g88b93171 +Mime-Version: 1.0 +Message-Id: +Date: Mon, 04 Apr 2022 17:10:00 +0200 +From: "Test User" +To: test@example.com +Subject: Multipart email +Content-Type: multipart/alternative; + boundary=67575b1b68b24603a2d00f02e032c975 + +--67575b1b68b24603a2d00f02e032c975 +Content-Type: text/plain + +*Multipart email.* + +-- +Test User +--67575b1b68b24603a2d00f02e032c975 +Content-Type: text/plain + +*Multipart email, but a longer text part.* + +-- +Test User +--67575b1b68b24603a2d00f02e032c975-- diff --git a/swh/web/tests/inbound_email/resources/plaintext.eml b/swh/web/tests/inbound_email/resources/plaintext.eml new file mode 100644 --- /dev/null +++ b/swh/web/tests/inbound_email/resources/plaintext.eml @@ -0,0 +1,15 @@ +Return-Path: +X-Mailer: MessagingEngine.com Webmail Interface +User-Agent: Cyrus-JMAP/3.7.0-alpha0-382-g88b93171a9-fm-20220330.001-g88b93171 +Mime-Version: 1.0 +Message-Id: <21e17bca-d6a7-40fb-bab8-5dadd939835b@www.fastmail.com> +Date: Mon, 04 Apr 2022 17:08:04 +0200 +From: "Test User" +To: test@example.com +Subject: Plain text email +Content-Type: text/plain + +Plain text email. 
@pytest.mark.parametrize(
    "filename,expected_parts,expected_absent",
    [
        pytest.param(
            "plaintext.eml",
            [b"Plain text email.\n\n-- \nTest User"],
            [],
            id="plaintext",
        ),
        pytest.param(
            "multipart.eml",
            [b"*Multipart email.*\n\n-- \nTest User"],
            [],
            id="multipart",
        ),
        pytest.param(
            "multipart_html_only.eml",
            [b"", b"Multipart email (a much longer html part)."],
            [b"Multipart email (short html part)"],
            id="multipart_html_only",
        ),
        pytest.param(
            "multipart_text_only.eml",
            [b"*Multipart email, but a longer text part.*\n\n--\nTest User"],
            [],
            id="multipart_text_only",
        ),
    ],
)
def test_get_message_plaintext(
    filename: str, expected_parts: List[bytes], expected_absent: List[bytes]
):
    """Check the body extracted from each ``.eml`` fixture.

    A single expected part must equal the extracted body exactly; several
    expected parts must each occur somewhere in it. Every ``expected_absent``
    fragment must not occur in the body.

    NOTE(review): the ``b""`` fragment in the ``multipart_html_only`` case is
    contained in any bytes object, so it asserts nothing - confirm whether a
    more specific marker was intended.
    """
    package = "swh.web.tests.inbound_email.resources"
    with open_binary(package, filename) as resource:
        message = email.message_from_binary_file(
            resource, policy=email.policy.default
        )

    assert isinstance(message, EmailMessage)

    body = utils.get_message_plaintext(message)
    assert body is not None

    exact_match_expected = len(expected_parts) == 1
    if exact_match_expected:
        assert body == expected_parts[0]
    else:
        for fragment in expected_parts:
            assert fragment in body

    for fragment in expected_absent:
        assert fragment not in body