From 042c6f364ec701c5fe1df725e8ee2e6a0ff17b9b Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Sun, 23 Mar 2025 20:52:47 -0400 Subject: [PATCH 1/4] Account for empty and invalid address headers Python began strict email address format verification [by default](https://github.com/python/cpython/commit/4a153a1d3b18803a684cd1bcc2cdf3ede3dbae19) in Python 3.13 and backported the changes for [security reasons](https://github.com/advisories/GHSA-5mwm-wccq-xqcp). As a result, when an address header is not used or malformed, `mail-parser` will return `[('','')]` (issues #132 and #133). To fix these issues while maintaining the security of the default `strict=True` option in `email.utils.parseaddr` and `email.utils.getaddresses`, the following changes were made to `mail-parser`: - The existing constant `ADDRESSES_HEADERS` list now only includes headers that can contain multiple addresses - `bcc` - `cc` - `reply-to` - `to` - A new constant `ADDRESS_HEADERS` list includes headers that can only contain one address - `delivered-to` - `from` - `sender` - Header parsing is only attempted if the header exists and has a value (Closes #133) - Headers in the `ADDRESS_HEADERS` list are parsed using `email.utils.parseaddr` instead of `email.utils.getaddresses`, returning a tuple instead of a list of tuples - For headers in either list, if an invalid address header is detected, a string stating `Invalid {} header` is added to the `defects` list, where `{}` is the name of the header, and `has_defects` is set to `True` - Invalid headers in the `ADDRESS_HEADERS` list are parsed manually if `email.utils.parseaddr` considers the address invalid, in order to show the intent of the defect on mail clients (Closes #132) ## Demo email 1 ```email From: alice@example.com <bob@example.com> To: example@example.com Subject: Example Email Hello world! 
``` ## Demo 1 JSON output before the changes ```json { "from": [ [ "", "" ] ], "delivered-to": [ [ "", "" ] ], "cc": [ [ "", "" ] ], "body": "Hello world!", "to_domains": [ "", "example.com" ], "reply-to": [ [ "", "" ] ], "subject": "Example Email", "bcc": [ [ "", "" ] ], "to": [ [ "", "example@example.com" ] ], "has_defects": false } ``` ## Demo 1 JSON output after the changes ```json { "to": [ [ "", "example@example.com" ] ], "body": "Hello world!", "from": [ "alice@example.com", "bob@example.com" ], "subject": "Example Email", "to_domains": [ "example.com" ], "has_defects": true, "defects": [ "Invalid from header" ], "defects_categories": [] } ``` ## Demo email 2 ```email From: alice@example.com To: bob@example.com Subject: Example Email Hello world! ``` ## Demo 2 JSON output before the changes ```json { "from": [ [ "", "" ] ], "delivered-to": [ [ "", "" ] ], "cc": [ [ "", "" ] ], "body": "Hello world!", "to_domains": [ "", "example.com" ], "reply-to": [ [ "", "" ] ], "subject": "Example Email", "bcc": [ [ "", "" ] ], "to": [ [ "", "bob@example.com" ] ], "has_defects": false } ``` ## Demo 2 JSON output after the changes ```json { "to": [ [ "", "bob@example.com" ] ], "body": "Hello world!", "from": [ "", "alice@example.com" ], "subject": "Example Email", "to_domains": [ "example.com" ], "has_defects": false, "defects": [], "defects_categories": [] } ``` --- src/mailparser/const.py | 3 ++- src/mailparser/core.py | 28 ++++++++++++++++++++++++++-- src/mailparser/utils.py | 6 ++++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/mailparser/const.py b/src/mailparser/const.py index 2b223f5..ce44b7f 100644 --- a/src/mailparser/const.py +++ b/src/mailparser/const.py @@ -78,7 +78,8 @@ EPILOGUE_DEFECTS = {"StartBoundaryNotFoundDefect"} -ADDRESSES_HEADERS = set(["bcc", "cc", "delivered-to", "from", "reply-to", "to"]) +ADDRESS_HEADERS = set(["delivered-to", "from", "sender"]) +ADDRESSES_HEADERS = set(["bcc", "cc", "reply-to", "to"]) # These parts are 
always returned OTHERS_PARTS = set( diff --git a/src/mailparser/core.py b/src/mailparser/core.py index 353e326..10794f1 100644 --- a/src/mailparser/core.py +++ b/src/mailparser/core.py @@ -27,7 +27,7 @@ import six import json -from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP +from mailparser.const import ADDRESS_HEADERS, ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP from mailparser.utils import ( convert_mail_date, @@ -587,9 +587,33 @@ def __getattr__(self, name): return json.dumps(raw, ensure_ascii=False) # object headers + elif name_header in ADDRESS_HEADERS: + h = decode_header_part(self.message.get(name_header, six.text_type())) + if h != "": + parsed_address = email.utils.parseaddr(h) + if parsed_address == ('',''): + defect = "Invalid {} header".format(name_header) + if defect not in self._defects: + self._defects.append(defect) + self._has_defects = True + parsed_address = h.split("<") + parsed_address = (parsed_address[0].strip(), + parsed_address[-1].strip(">")) + return parsed_address + elif name_header in ADDRESSES_HEADERS: h = decode_header_part(self.message.get(name_header, six.text_type())) - return email.utils.getaddresses([h]) + if h == "": + return [] + parsed_addresses = email.utils.getaddresses([h]) + if ('','') in parsed_addresses: + while ('','') in parsed_addresses: + parsed_addresses.remove(('','')) + defect = "Invalid {} header".format(name_header) + if defect not in self._defects: + self._defects.append(defect) + self._has_defects = True + return parsed_addresses # others headers else: diff --git a/src/mailparser/utils.py b/src/mailparser/utils.py index 5800c0a..6f44d62 100644 --- a/src/mailparser/utils.py +++ b/src/mailparser/utils.py @@ -42,6 +42,7 @@ import six from mailparser.const import ( + ADDRESS_HEADERS, ADDRESSES_HEADERS, JUNK_PATTERN, OTHERS_PARTS, @@ -519,10 +520,11 @@ def get_mail_keys(message, complete=True): if complete: log.debug("Get all headers") all_headers_keys = {i.lower() for i in message.keys()} 
- all_parts = ADDRESSES_HEADERS | OTHERS_PARTS | all_headers_keys + all_parts = ADDRESS_HEADERS | ADDRESSES_HEADERS | OTHERS_PARTS | \ + all_headers_keys else: log.debug("Get only mains headers") - all_parts = ADDRESSES_HEADERS | OTHERS_PARTS + all_parts = ADDRESS_HEADERS | ADDRESSES_HEADERS | OTHERS_PARTS log.debug("All parts to get: {}".format(", ".join(all_parts))) return all_parts From 2b79bb6129904eed7979f363fde0e76222fd410d Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Mon, 24 Mar 2025 08:32:09 -0400 Subject: [PATCH 2/4] Fix missing object attributes --- src/mailparser/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mailparser/core.py b/src/mailparser/core.py index 10794f1..2cded6a 100644 --- a/src/mailparser/core.py +++ b/src/mailparser/core.py @@ -601,7 +601,7 @@ def __getattr__(self, name): parsed_address[-1].strip(">")) return parsed_address - elif name_header in ADDRESSES_HEADERS: + if name_header in ADDRESSES_HEADERS: h = decode_header_part(self.message.get(name_header, six.text_type())) if h == "": return [] From e5231ceb912c8f425b410762ceb37aa576cea476 Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Mon, 24 Mar 2025 09:12:56 -0400 Subject: [PATCH 3/4] Fix address parsing --- src/mailparser/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mailparser/core.py b/src/mailparser/core.py index 2cded6a..3f2283f 100644 --- a/src/mailparser/core.py +++ b/src/mailparser/core.py @@ -599,9 +599,9 @@ def __getattr__(self, name): parsed_address = h.split("<") parsed_address = (parsed_address[0].strip(), parsed_address[-1].strip(">")) - return parsed_address + return parsed_address - if name_header in ADDRESSES_HEADERS: + elif name_header in ADDRESSES_HEADERS: h = decode_header_part(self.message.get(name_header, six.text_type())) if h == "": return [] From 01e0e52c480113cebfee65663664be9492014791 Mon Sep 17 00:00:00 2001 From: Sean Whalen Date: Mon, 24 Mar 2025 20:27:43 -0400 Subject: [PATCH 4/4] 
Remove quote marks from manually parsed display name --- src/mailparser/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mailparser/core.py b/src/mailparser/core.py index 3f2283f..6433b2b 100644 --- a/src/mailparser/core.py +++ b/src/mailparser/core.py @@ -597,7 +597,7 @@ def __getattr__(self, name): self._defects.append(defect) self._has_defects = True parsed_address = h.split("<") - parsed_address = (parsed_address[0].strip(), + parsed_address = (parsed_address[0].strip().strip('"'), parsed_address[-1].strip(">")) return parsed_address