diff --git a/swh/web/inbound_email/utils.py b/swh/web/inbound_email/utils.py index 9a210626..a7252993 100644 --- a/swh/web/inbound_email/utils.py +++ b/swh/web/inbound_email/utils.py @@ -1,204 +1,213 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import dataclass from email.headerregistry import Address from email.message import EmailMessage import logging from typing import List, Optional, Set from django.core.signing import Signer from django.utils.crypto import constant_time_compare logger = logging.getLogger(__name__) def extract_recipients(message: EmailMessage) -> List[Address]: """Extract a list of recipients of the `message`. This uses the ``To`` and ``Cc`` fields. """ ret = [] for header_name in ("to", "cc"): for header in message.get_all(header_name, []): ret.extend(header.addresses) return ret @dataclass class AddressMatch: """Data related to a recipient match""" recipient: Address """The original recipient that matched the expected address""" extension: Optional[str] """The parsed +-extension of the matched recipient address""" def single_recipient_matches( recipient: Address, address: str ) -> Optional[AddressMatch]: """Check whether a single address matches the provided base address. The match is case-insensitive, which is not really RFC-compliant but is consistent with what most people would expect. This function supports "+-addressing", where the local part of the email address is appended with a `+`. """ parsed_address = Address(addr_spec=address.lower()) if recipient.domain.lower() != parsed_address.domain: return None base_username, _, extension = recipient.username.partition("+") if base_username.lower() != parsed_address.username: return None return AddressMatch(recipient=recipient, extension=extension or None) def recipient_matches(message: EmailMessage, address: str) -> List[AddressMatch]: """Check whether any of the message recipients match the given address. The match is case-insensitive, which is not really RFC-compliant but matches what most people would expect. This function supports "+-addressing", where the local part of the email address is appended with a `+`. """ ret = [] for recipient in extract_recipients(message): match = single_recipient_matches(recipient, address) if match: ret.append(match) return ret ADDRESS_SIGNER_SEP = "." """Separator for email address signatures""" def get_address_signer(salt: str) -> Signer: """Get a signer for the given seed""" return Signer(salt=salt, sep=ADDRESS_SIGNER_SEP) def get_address_for_pk(salt: str, base_address: str, pk: int) -> str: """Get the email address that will be able to receive messages to be logged in this request.""" if "@" not in base_address: raise ValueError("Base address needs to contain an @") username, domain = base_address.split("@") extension = get_address_signer(salt).sign(str(pk)) return f"{username}+{extension}@{domain}" def get_pk_from_extension(salt: str, extension: str) -> int: """Retrieve the primary key for the given inbound address extension. We reimplement `Signer.unsign`, because the extension can be casemapped at any point in the email chain (even though email is, theoretically, case sensitive), so we have to compare lowercase versions of both the extension and the signature... Raises ValueError if the signature couldn't be verified. """ value, signature = extension.rsplit(ADDRESS_SIGNER_SEP, 1) expected_signature = get_address_signer(salt).signature(value) if not constant_time_compare(signature.lower(), expected_signature.lower()): raise ValueError(f"Invalid signature for extension {extension}") return int(value) def get_pks_from_message( salt: str, base_address: str, message: EmailMessage ) -> Set[int]: """Retrieve the set of primary keys that were successfully decoded from the recipients of the ``message`` matching ``base_address``. This uses :func:`recipient_matches` to retrieve all the recipient addresses matching ``base_address``, then :func:`get_pk_from_extension` to decode the primary key and verify the signature for every extension. To generate relevant email addresses, use :func:`get_address_for_pk` with the same ``base_address`` and ``salt``. Returns: the set of primary keys that were successfully decoded from the recipients of the ``message`` """ ret: Set[int] = set() for match in recipient_matches(message, base_address): extension = match.extension if extension is None: logger.debug( "Recipient address %s cannot be matched to a request, ignoring", match.recipient.addr_spec, ) continue try: ret.add(get_pk_from_extension(salt, extension)) except ValueError: logger.debug( "Recipient address %s failed validation", match.recipient.addr_spec ) continue return ret def get_message_plaintext(message: EmailMessage) -> Optional[bytes]: """Get the plaintext body for a given message, if any such part exists. If only a html part exists, return that instead. If there are multiple, ambiguous plain text or html parts in the message, this function will return the largest of them. """ if not message.is_multipart(): single_part = message.get_payload(decode=True).rstrip(b"\n") return single_part or None text_parts: List[bytes] = [] fallback_parts: List[bytes] = [] + all_parts: List[bytes] = [] for part in message.walk(): content_type = part.get_content_type() content_disposition = str(part.get("Content-Disposition")) if "attachment" in content_disposition: continue if content_type == "text/plain": current_part = part.get_payload(decode=True).rstrip(b"\n") if current_part: text_parts.append(current_part) - elif not text_parts and content_type == "text/html": + all_parts.append(current_part) + elif content_type == "text/html": current_part = part.get_payload(decode=True).rstrip(b"\n") if current_part: fallback_parts.append(current_part) - - if text_parts: - return max(text_parts, key=len) - - if fallback_parts: - return max(fallback_parts, key=len) + all_parts.append(current_part) + + assert message.get_content_maintype() == "multipart" + if message.get_content_subtype() == "alternative": + if text_parts: + return max(text_parts, key=len) + + if fallback_parts: + return max(fallback_parts, key=len) + else: + # Handles multipart/mixed; but this should be an appropriate handling for + # other multipart formats + return b"".join(all_parts) return None diff --git a/swh/web/tests/inbound_email/resources/multipart.eml b/swh/web/tests/inbound_email/resources/multipart_alternative.eml similarity index 100% rename from swh/web/tests/inbound_email/resources/multipart.eml rename to swh/web/tests/inbound_email/resources/multipart_alternative.eml diff --git a/swh/web/tests/inbound_email/resources/multipart_html_only.eml b/swh/web/tests/inbound_email/resources/multipart_alternative_html_only.eml similarity index 100% rename from swh/web/tests/inbound_email/resources/multipart_html_only.eml rename to swh/web/tests/inbound_email/resources/multipart_alternative_html_only.eml diff --git a/swh/web/tests/inbound_email/resources/multipart_text_only.eml b/swh/web/tests/inbound_email/resources/multipart_alternative_text_only.eml similarity index 100% rename from swh/web/tests/inbound_email/resources/multipart_text_only.eml rename to swh/web/tests/inbound_email/resources/multipart_alternative_text_only.eml diff --git a/swh/web/tests/inbound_email/resources/multipart_mixed.eml b/swh/web/tests/inbound_email/resources/multipart_mixed.eml new file mode 100644 index 00000000..ff5f0dfa --- /dev/null +++ b/swh/web/tests/inbound_email/resources/multipart_mixed.eml @@ -0,0 +1,23 @@ +Return-Path: +Content-Type: multipart/mixed; boundary="------------BR6CvS06rj0369ladb31gYCL" +Message-ID: <8318382c-b4d3-9239-0805-e8cff4b9187a@inria.fr> +Date: Thu, 7 Apr 2022 10:30:36 +0200 +MIME-Version: 1.0 +User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 + Thunderbird/91.7.0 +Content-Language: en-US +To: Test User +From: Test User +Subject: test email + +This is a multi-part message in MIME format. +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +This is plain text + +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/html + +and this is HTML diff --git a/swh/web/tests/inbound_email/resources/multipart_mixed2.eml b/swh/web/tests/inbound_email/resources/multipart_mixed2.eml new file mode 100644 index 00000000..93cf4ea2 --- /dev/null +++ b/swh/web/tests/inbound_email/resources/multipart_mixed2.eml @@ -0,0 +1,25 @@ +Return-Path: +Content-Type: multipart/mixed; boundary="------------BR6CvS06rj0369ladb31gYCL" +Message-ID: <8318382c-b4d3-9239-0805-e8cff4b9187a@inria.fr> +Date: Thu, 7 Apr 2022 10:30:36 +0200 +MIME-Version: 1.0 +User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 + Thunderbird/91.7.0 +Content-Language: en-US +To: Test User +From: Test User +Subject: test email + +This is a multi-part message in MIME format. +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +This is plain text + +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +and this is more text + diff --git a/swh/web/tests/inbound_email/resources/multipart_mixed_text_only.eml b/swh/web/tests/inbound_email/resources/multipart_mixed_text_only.eml new file mode 100644 index 00000000..d80c458f --- /dev/null +++ b/swh/web/tests/inbound_email/resources/multipart_mixed_text_only.eml @@ -0,0 +1,36 @@ +Return-Path: +Content-Type: multipart/mixed; boundary="------------BR6CvS06rj0369ladb31gYCL" +Message-ID: <8318382c-b4d3-9239-0805-e8cff4b9187a@inria.fr> +Date: Thu, 7 Apr 2022 10:30:36 +0200 +MIME-Version: 1.0 +User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 + Thunderbird/91.7.0 +Content-Language: en-US +To: Test User +From: Test User +Subject: test email + +This is a multi-part message in MIME format. +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/plain; charset=UTF-8; format=flowed +Content-Transfer-Encoding: 7bit + +My test email +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/html; charset=UTF-8; name="attachment.html" +Content-Disposition: attachment; filename="attachment.html" +Content-Transfer-Encoding: base64 + +PGh0bWw+CiAgICA8aGVhZD4KICAgICAgICA8dGl0bGU+VGhpcyBpcyBhbiBleGFtcGxlIEhU +TUwgYXR0YWNobWVudDwvdGl0bGU+CiAgICA8L2hlYWQ+CiAgICA8Ym9keT4KICAgICAgICA8 +cD5UaGlzIGlzIGFuIGV4YW1wbGUgSFRNTCBhdHRhY2htZW50PC9wPgogICAgPC9ib2R5Pgo8 +L2h0bWw+Cg== +--------------BR6CvS06rj0369ladb31gYCL +Content-Type: text/plain; charset=UTF-8; name="attachment.txt" +Content-Disposition: attachment; filename="attachment.txt" +Content-Transfer-Encoding: base64 + +VGhpcyBpcyBhIHRleHQgYXR0YWNobWVudC4K + +--------------BR6CvS06rj0369ladb31gYCL-- + diff --git a/swh/web/tests/inbound_email/test_utils.py b/swh/web/tests/inbound_email/test_utils.py index e82106e7..35923af1 100644 --- a/swh/web/tests/inbound_email/test_utils.py +++ b/swh/web/tests/inbound_email/test_utils.py @@ -1,299 +1,321 @@ # Copyright (C) 2022 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU Affero General Public License version 3, or any later version # See top-level LICENSE file for more information import email from email.headerregistry import Address from email.message import EmailMessage import email.policy from importlib.resources import open_binary from typing import List import pytest from swh.web.inbound_email import utils def test_extract_recipients(): message = EmailMessage() assert utils.extract_recipients(message) == [] message["To"] = "Test Recipient " assert utils.extract_recipients(message) == [ Address(display_name="Test Recipient", addr_spec="test-recipient@example.com") ] message["Cc"] = ( "test-recipient-2@example.com, " "Another Test Recipient " ) assert utils.extract_recipients(message) == [ Address(display_name="Test Recipient", addr_spec="test-recipient@example.com"), Address(addr_spec="test-recipient-2@example.com"), Address( display_name="Another Test Recipient", addr_spec="test-recipient-3@example.com", ), ] del message["To"] assert utils.extract_recipients(message) == [ Address(addr_spec="test-recipient-2@example.com"), Address( display_name="Another Test Recipient", addr_spec="test-recipient-3@example.com", ), ] def test_single_recipient_matches(): assert ( utils.single_recipient_matches( Address(addr_spec="test@example.com"), "match@example.com" ) is None ) assert utils.single_recipient_matches( Address(addr_spec="match@example.com"), "match@example.com" ) == utils.AddressMatch( recipient=Address(addr_spec="match@example.com"), extension=None ) assert utils.single_recipient_matches( Address(addr_spec="MaTch+12345AbC@exaMple.Com"), "match@example.com" ) == utils.AddressMatch( recipient=Address(addr_spec="MaTch+12345AbC@exaMple.Com"), extension="12345AbC" ) def test_recipient_matches(): message = EmailMessage() assert utils.recipient_matches(message, "match@example.com") == [] message = EmailMessage() message["to"] = "nomatch@example.com" assert utils.recipient_matches(message, "match@example.com") == [] message = EmailMessage() message["to"] = "match@example.com" assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match@example.com"), extension=None ) ] message = EmailMessage() message["to"] = "match+extension@example.com" assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match+extension@example.com"), extension="extension", ) ] message = EmailMessage() message["to"] = "match+weird+plussed+extension@example.com" assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match+weird+plussed+extension@example.com"), extension="weird+plussed+extension", ) ] message = EmailMessage() message["to"] = "nomatch@example.com" message["cc"] = ", ".join( ( "match@example.com", "match@notamatch.example.com", "Another Match ", ) ) assert utils.recipient_matches(message, "match@example.com") == [ utils.AddressMatch( recipient=Address(addr_spec="match@example.com"), extension=None, ), utils.AddressMatch( recipient=Address( display_name="Another Match", addr_spec="match+extension@example.com" ), extension="extension", ), ] def test_recipient_matches_casemapping(): message = EmailMessage() message["to"] = "match@example.com" assert utils.recipient_matches(message, "Match@Example.Com") assert utils.recipient_matches(message, "match@example.com") message = EmailMessage() message["to"] = "Match+weirdCaseMapping@Example.Com" matches = utils.recipient_matches(message, "match@example.com") assert matches assert matches[0].extension == "weirdCaseMapping" def test_get_address_for_pk(): salt = "test_salt" pks = [1, 10, 1000] base_address = "base@example.com" addresses = { pk: utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for pk in pks } assert len(set(addresses.values())) == len(addresses) for pk, address in addresses.items(): localpart, _, domain = address.partition("@") base_localpart, _, extension = localpart.partition("+") assert domain == "example.com" assert base_localpart == "base" assert extension.startswith(f"{pk}.") def test_get_address_for_pk_salt(): pk = 1000 base_address = "base@example.com" addresses = [ utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for salt in ["salt1", "salt2"] ] assert len(addresses) == len(set(addresses)) def test_get_pks_from_message(): salt = "test_salt" pks = [1, 10, 1000] base_address = "base@example.com" addresses = { pk: utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for pk in pks } message = EmailMessage() message["To"] = "test@example.com" assert utils.get_pks_from_message(salt, base_address, message) == set() message = EmailMessage() message["To"] = f"Test Address <{addresses[1]}>" assert utils.get_pks_from_message(salt, base_address, message) == {1} message = EmailMessage() message["To"] = f"Test Address <{addresses[1]}>" message["Cc"] = ", ".join( [ f"Test Address <{addresses[1]}>", f"Another Test Address <{addresses[10].lower()}>", "A Third Address ", ] ) assert utils.get_pks_from_message(salt, base_address, message) == {1, 10} def test_get_pks_from_message_logging(caplog): salt = "test_salt" pks = [1, 10, 1000] base_address = "base@example.com" addresses = { pk: utils.get_address_for_pk(salt=salt, base_address=base_address, pk=pk) for pk in pks } message = EmailMessage() message["To"] = f"Test Address <{base_address}>" assert utils.get_pks_from_message(salt, base_address, message) == set() relevant_records = [ record for record in caplog.records if record.name == "swh.web.inbound_email.utils" ] assert len(relevant_records) == 1 assert relevant_records[0].levelname == "DEBUG" assert ( f"{base_address} cannot be matched to a request" in relevant_records[0].getMessage() ) # Replace the signature with "mangle{signature}" mangled_address = addresses[1].replace(".", ".mangle", 1) message = EmailMessage() message["To"] = f"Test Address <{mangled_address}>" assert utils.get_pks_from_message(salt, base_address, message) == set() relevant_records = [ record for record in caplog.records if record.name == "swh.web.inbound_email.utils" ] assert len(relevant_records) == 2 assert relevant_records[0].levelname == "DEBUG" assert relevant_records[1].levelname == "DEBUG" assert f"{mangled_address} failed" in relevant_records[1].getMessage() @pytest.mark.parametrize( "filename,expected_parts,expected_absent", ( pytest.param( "plaintext.eml", [b"Plain text email.\n\n-- \nTest User"], [], id="plaintext", ), pytest.param( - "multipart.eml", + "multipart_alternative.eml", [b"*Multipart email.*\n\n-- \nTest User"], [], - id="multipart", + id="multipart_alternative", ), pytest.param( - "multipart_html_only.eml", + "multipart_alternative_html_only.eml", [b"", b"Multipart email (a much longer html part)."], [b"Multipart email (short html part)"], - id="multipart_html_only", + id="multipart_alternative_html_only", ), pytest.param( - "multipart_text_only.eml", + "multipart_alternative_text_only.eml", [b"*Multipart email, but a longer text part.*\n\n--\nTest User"], [], - id="multipart_text_only", + id="multipart_alternative_text_only", + ), + pytest.param( + "multipart_mixed.eml", + [b"This is plain text", b"and this is HTML"], + [b"This is a multi-part message in MIME format."], + id="multipart_mixed", + ), + pytest.param( + "multipart_mixed2.eml", + [b"This is plain text", b"and this is more text"], + [b"This is a multi-part message in MIME format."], + id="multipart_mixed2", + ), + pytest.param( + "multipart_mixed_text_only.eml", + [b"My test email"], + [ + b"HTML attachment", + b"text attachment", + b"This is a multi-part message in MIME format.", + ], + id="multipart_mixed_text_only", ), ), ) def test_get_message_plaintext( filename: str, expected_parts: List[bytes], expected_absent: List[bytes] ): with open_binary("swh.web.tests.inbound_email.resources", filename) as f: message = email.message_from_binary_file(f, policy=email.policy.default) assert isinstance(message, EmailMessage) plaintext = utils.get_message_plaintext(message) assert plaintext is not None if len(expected_parts) == 1: assert plaintext == expected_parts[0] else: for part in expected_parts: assert part in plaintext for part in expected_absent: assert part not in plaintext