Merge branch 'release/3.15.0'

SpamScope · Feb 26, 2021 · 59442ba · 59442ba
2 parents 9791b2d + 0abd896
commit 59442ba
Show file tree

Hide file tree

Showing 10 changed files with 111 additions and 59 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,9 +5,9 @@ language: python
 
 python:
     - "2.7"
-    - "3.4"
-    - "3.5"
-    - "3.6"
+    - "3.7"
+    - "3.8"
+    - "3.9"
 
 before_install:
     - sudo apt-get -qq update

diff --git a/README.md b/README.md
@@ -8,8 +8,6 @@
 
 # mail-parser
 
-## Overview
-
 mail-parser is not only a wrapper for [email](https://docs.python.org/2/library/email.message.html) Python Standard Library.
 It gives you an easy way to go from a raw mail to a Python object that you can use in your code.
 It's the key module of [SpamScope](https://github.com/SpamScope/spamscope).
@@ -28,15 +26,29 @@ $ apt-cache show libemail-outlook-message-perl
 
 mail-parser supports Python 3.
 
-## mail-parser on Web
+
+# Apache 2 Open Source License
+mail-parser can be downloaded, used, and modified free of charge. It is available under the Apache 2 license.
+
+If you want to support the project:
+
+
+[![Donate](https://www.paypal.com/en_US/i/btn/btn_donateCC_LG.gif "Donate")](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=VEPXYP745KJF2)
+
+![Bitcoin Donate](https://i.stack.imgur.com/MnQ6V.png)
+
+![](https://github.com/SpamScope/mail-parser/raw/develop/docs/bitcoin-qrcode.png)
+
+
+# mail-parser on Web
  - [Splunk app](https://splunkbase.splunk.com/app/4129/)
  - [FreeBSD port](https://www.freshports.org/mail/py-mail-parser/)
  - [Arch User Repository](https://aur.archlinux.org/packages/mailparser/)
 
 
-## Description
+# Description
 
-mail-parser takes as input a raw email and generates a parsed object. The properties of this object are the same name of 
+mail-parser takes as input a raw email and generates a parsed object. The properties of this object have the same names as the
 [RFC headers](https://www.iana.org/assignments/message-headers/message-headers.xhtml):
 
   - bcc
@@ -107,27 +119,18 @@ $ mail.to_raw (raw header)
 
 The command line tool uses the JSON format.
 
-### Defects
+## Defects
 These defects can be used to evade the antispam filter. An example is a mail with a malformed boundary that can hide an illegitimate epilogue (often malware).
 This library can take these epilogues.
 
 
-### Apache 2 Open Source License
-mail-parser can be downloaded, used, and modified free of charge. It is available under the Apache 2 license.
-
-If you want support the project:
-
-
-[![Donate](https://www.paypal.com/en_US/i/btn/btn_donateCC_LG.gif "Donate")](https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=VEPXYP745KJF2)
-
-
-## Authors
+# Authors
 
-### Main Author
+## Main Author
 **Fedele Mantuano**: [LinkedIn](https://www.linkedin.com/in/fmantuano/)
 
 
-## Installation
+# Installation
 
 Clone repository
 
@@ -149,7 +152,7 @@ or use `pip`:
 $ pip install mail-parser
 ```
 
-## Usage in a project
+# Usage in a project
 
 Import `mailparser` module:
 
@@ -196,7 +199,7 @@ It's possible to write the attachments on disk with the method:
 mail.write_attachments(base_path)
 ```
 
-## Usage from command-line
+# Usage from command-line
 
 If you installed mailparser with `pip` or `setup.py`, you can use it from the command line.
 
@@ -216,7 +219,7 @@ optional arguments:
   -s STRING, --string STRING
                         Raw email string (default: None)
   -k, --stdin           Enable parsing from stdin (default: False)
-  -l {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}, --log-level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}                                                                                          
+  -l {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}, --log-level {CRITICAL,ERROR,WARNING,INFO,DEBUG,NOTSET}
                         Set log level (default: WARNING)
   -j, --json            Show the JSON of parsed mail (default: False)
   -b, --body            Print the body of mail (default: False)
@@ -253,11 +256,11 @@ $ mailparser -f example_mail -j
 
 This example will show you the tokenized mail in a JSON pretty format.
 
-From [raw mail](https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e) to 
+From [raw mail](https://gist.github.com/fedelemantuano/5dd702004c25a46b2bd60de21e67458e) to
 [parsed mail](https://gist.github.com/fedelemantuano/e958aa2813c898db9d2d09469db8e6f6).
 
 
-## Exceptions
+# Exceptions
 
 Exceptions hierarchy of mail-parser:
 

diff --git a/docs/bitcoin-qrcode.png b/docs/bitcoin-qrcode.png
diff --git a/mailparser/const.py b/mailparser/const.py
@@ -65,16 +65,18 @@
         r'envelope-from|\s*[(]?envelope-sender|\s+'
         r'from|\s+by|\s+id|\s+for|\s+with(?! cipher)|;))'
     ),
-
     # assumes emails are always inside <>
     r'(?:envelope-from\s+<(?P<envelope_from>.+?)>)',
     r'(?:envelope-sender\s+<(?P<envelope_sender>.+?)>)',
 
     # datetime comes after ; at the end
     r';\s*(?P<date>.*)',
-    
+
     # sendgrid datetime
-    r'(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+'
+    (
+        r'(?P<date>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:'
+        r'\d{2}\.\d{9} \+0000 UTC) m=\+\d+\.\d+'
+    )
 ]
 
 RECEIVED_COMPILED_LIST = [

diff --git a/mailparser/mailparser.py b/mailparser/mailparser.py
@@ -42,6 +42,7 @@
     msgconvert,
     ported_open,
     ported_string,
+    random_string,
     receiveds_parsing,
     write_attachments,
 )
@@ -353,14 +354,31 @@ def parse(self):
                 charset = p.get_content_charset('utf-8')
                 charset_raw = p.get_content_charset()
                 log.debug("Charset {!r} part {!r}".format(charset, i))
+                content_disposition = ported_string(
+                    p.get('content-disposition'))
+                log.debug("content-disposition {!r} part {!r}".format(
+                    content_disposition, i))
                 content_id = ported_string(p.get('content-id'))
                 log.debug("content-id {!r} part {!r}".format(
                     content_id, i))
-                filename = decode_header_part(
-                    p.get_filename("{}".format(content_id)))
+                content_subtype = ported_string(p.get_content_subtype())
+                log.debug("content subtype {!r} part {!r}".format(
+                    content_subtype, i))
+                filename = decode_header_part(p.get_filename())
 
-                # this is an attachment
+                is_attachment = False
                 if filename:
+                    is_attachment = True
+                else:
+                    if content_id and content_subtype not in ('html', 'plain'):
+                        is_attachment = True
+                        filename = content_id
+                    elif content_subtype in ('rtf'):
+                        is_attachment = True
+                        filename = "{}.rtf".format(random_string())
+
+                # this is an attachment
+                if is_attachment:
                     log.debug("Email part {!r} is an attachment".format(i))
                     log.debug("Filename {!r} part {!r}".format(filename, i))
                     binary = False
@@ -412,8 +430,23 @@ def parse(self):
                 # this isn't an attachments
                 else:
                     log.debug("Email part {!r} is not an attachment".format(i))
-                    payload = ported_string(
-                        p.get_payload(decode=True), encoding=charset)
+
+                    # Get the payload using get_payload method with decode=True
+                    # As Python truly decodes only 'base64',
+                    # 'quoted-printable', 'x-uuencode',
+                    # 'uuencode', 'uue', 'x-uue'
+                    # And for other encodings it breaks the characters so
+                    # we need to decode them with encoding python is applying
+                    # To maintain the characters
+                    payload = p.get_payload(decode=True)
+                    cte = p.get('Content-Transfer-Encoding')
+                    if cte:
+                        cte = cte.lower()
+                    if not cte or cte in ['7bit', '8bit']:
+                        payload = payload.decode('raw-unicode-escape')
+                    else:
+                        payload = ported_string(payload, encoding=charset)
+
                     if payload:
                         if p.get_content_subtype() == 'html':
                             self._text_html.append(payload)

diff --git a/mailparser/utils.py b/mailparser/utils.py
@@ -101,19 +101,19 @@ def ported_string(raw_data, encoding='utf-8', errors='ignore'):
         return six.text_type()
 
     if isinstance(raw_data, six.text_type):
-        return raw_data.strip()
+        return raw_data
 
     if six.PY2:
         try:
-            return six.text_type(raw_data, encoding, errors).strip()
+            return six.text_type(raw_data, encoding, errors)
         except LookupError:
-            return six.text_type(raw_data, "utf-8", errors).strip()
+            return six.text_type(raw_data, "utf-8", errors)
 
     if six.PY3:
         try:
-            return six.text_type(raw_data, encoding).strip()
+            return six.text_type(raw_data, encoding)
         except (LookupError, UnicodeDecodeError):
-            return six.text_type(raw_data, "utf-8", errors).strip()
+            return six.text_type(raw_data, "utf-8", errors)
 
 
 def decode_header_part(header):
@@ -141,7 +141,7 @@ def decode_header_part(header):
         log.error("Failed decoding header part: {}".format(header))
         output += header
 
-    return output
+    return output.strip()
 
 
 def ported_open(file_):
@@ -290,7 +290,23 @@ def parse_received(received):
     if len(values_by_clause) == 0:
         # we weren't able to match anything...
         msg = "Unable to match any clauses in %s" % (received)
-        log.error(msg)
+
+        # Modification #1: Commenting the following log as
+        # this raised exception is caught above and then
+        # raw header is updated in response
+        # We don't want to get so many errors in our error
+        # logger as we are not even trying to parse the
+        # received headers
+        # Wanted to make it configurable via settings,
+        # but this package does not depend on django and
+        # making configurable setting
+        # will make it django dependent,
+        # so better to keep it working with only python
+        # dependent and on any framework of python
+        # commenting it just for our use
+
+        # log.error(msg)
+
         raise MailParserReceivedParsingError(msg)
     return values_by_clause
 
@@ -468,7 +484,7 @@ def get_header(message, name):
         headers = [decode_header_part(i) for i in headers]
         if len(headers) == 1:
             # in this case return a string
-            return headers[0]
+            return headers[0].strip()
         # in this case return a list
         return headers
     return six.text_type()
@@ -551,7 +567,6 @@ def write_sample(binary, payload, path, filename):  # pragma: no cover
     """
     if not os.path.exists(path):
         os.makedirs(path)
-
     sample = os.path.join(path, filename)
 
     if binary:

diff --git a/mailparser/version.py b/mailparser/version.py
@@ -17,7 +17,7 @@
 limitations under the License.
 """
 
-__version__ = "3.14.0"
+__version__ = "3.15.0"
 
 if __name__ == "__main__":
     print(__version__)
diff --git a/setup.py b/setup.py
@@ -64,6 +64,8 @@
         "Programming Language :: Python :: 3.5",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
     ],
     install_requires=requires,
     entry_points={'console_scripts': [

diff --git a/tests/test_mail_parser.py b/tests/test_mail_parser.py
@@ -200,14 +200,14 @@ def test_fingerprints_body(self):
         mail = mailparser.parse_from_file(mail_test_1)
         md5, sha1, sha256, sha512 = fingerprints(
             mail.body.encode("utf-8"))
-        self.assertEqual(md5, "1bbdb7dcf511113bbc0c1b214aeac392")
-        self.assertEqual(sha1, "ce9e62b50fa4e2168278880b14460b905b24eb4b")
-        self.assertEqual(sha256, ("1e9b96e3f1bc74702f9703391e8ba0715b849"
-                                  "7127a7ff857013ab33385898574"))
-        self.assertEqual(sha512, ("ad858f7b5ec5549e55650fd13df7683e403489"
-                                  "77522995851fb6b625ac54744cf3a4bf652784"
-                                  "dba971ef99afeec4e6caf2fdd10be72eabb730"
-                                  "c312ffbe1c4de3"))
+        self.assertEqual(md5, "55852a2efe95e7249887c92cc02123f8")
+        self.assertEqual(sha1, "62fef1e38327ed09363624c3aff8ea11723ee05f")
+        self.assertEqual(sha256, ("cd4af1017f2e623f6d38f691048b6"
+                                  "a28d8b1f44a0478137b4337eac6de78f71a"))
+        self.assertEqual(sha512, ("4a573c7929b078f2a2c1c0f869d418b0c020d4"
+                                  "d37196bd6dcc209f9ccb29ca67355aa5e47b97"
+                                  "c8bf90377204f59efde7ba1fc071b6f250a665"
+                                  "72f63b997e92e8"))
 
     def test_fingerprints_unicodeencodeerror(self):
         mail = mailparser.parse_from_file(mail_test_7)
@@ -456,7 +456,7 @@ def test_parse_from_file_msg(self):
         m = mailparser.parse_from_file_msg(mail_outlook_1)
         email = m.mail
         self.assertIn("attachments", email)
-        self.assertEqual(len(email["attachments"]), 5)
+        self.assertEqual(len(email["attachments"]), 6)
         self.assertIn("from", email)
         self.assertEqual(email["from"][0][1], "[email protected]")
         self.assertIn("subject", email)
@@ -564,11 +564,7 @@ def test_ported_string(self):
         s = ported_string(raw_data)
         self.assertEqual(s, six.text_type())
 
-        raw_data = "test "
-        s = ported_string(raw_data)
-        self.assertEqual(s, "test")
-
-        raw_data = u"test "
+        raw_data = u"test"
         s = ported_string(raw_data)
         self.assertEqual(s, "test")
 
@@ -671,5 +667,6 @@ def test_write_uuencode_attachment(self):
         shutil.rmtree(temp_dir)
         self.assertEqual(md5.hexdigest(), '4f2cf891e7cfb349fca812091f184ecc')
 
+
 if __name__ == '__main__':
     unittest.main(verbosity=2)
diff --git a/tox.ini b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = begin, py27, py37, end
+envlist = begin, py27, py39, end
 
 [testenv:begin]
 commands = coverage erase