[local-sync] Correctly handle messages with non-alternative multipart text bodies

Summary: It's possible to have multiple inline HTML parts in a message, or even a multipart/alternative part that contains text and HTML, followed by a plaintext signature. Previously, if there was more than one text part in an email, we would pick the _last_ text/html or text/plain part that we found, and treat that as the entire message body. This worked most of the time, but failed to display the full message body in some edge cases. This patch fixes that by resolving multipart/alternative subparts to a single part in the mimepart fetch stage, and then treating each desired mime part separately when parsing the message, concatenating them if there are multiple. This makes K2's handling of multipart MIME message text better, bug-wise, than the Python sync engine's, which has been mangling some rare messages forever. (Example from my email: no email from the MIT EECS Jobs List has ever displayed the mailing list signature in N1.) Note that this patch also removes our tentative support for PGP encrypted messages. I'd rather add that back in later when I've dug up some real example messages to test on, rather than leaving it in, in its current not-really-tested and probably not-really-working state, since it makes it harder to make sure that the rest of the logic isn't broken. Test Plan: manual for now - added examples of this to my growing list of regression tests to add to the message parser unit tests once I fix them. Reviewers: juan, evan Reviewed By: evan Differential Revision: https://phab.nylas.com/D3600
2024-09-22 08:16:09 +08:00 · 2017-01-05 17:18:18 -08:00 · 2017-01-05 17:18:18 -08:00 · 8238fe9594
parent 78aa3291d6
commit 8238fe9594
2 changed files with 112 additions and 80 deletions
--- a/packages/local-sync/src/local-sync-worker/imap/fetch-messages-in-folder.js
+++ b/packages/local-sync/src/local-sync-worker/imap/fetch-messages-in-folder.js
@ -158,24 +158,47 @@ class FetchMessagesInFolder extends SyncOperation {
    const desired = [];
    const available = [];
    const unseen = [struct];
-    const ignoreTypes = new Set(['alternative', 'mixed', 'signed']);
-    const desiredTypes = new Set(['text/plain', 'text/html', 'application/pgp-encrypted']);
+    const desiredTypes = new Set(['text/plain', 'text/html']);
+    // MIME structures can be REALLY FREAKING COMPLICATED. To simplify
+    // processing, we flatten the MIME structure by walking it depth-first,
+    // throwing away all multipart headers with the exception of
+    // multipart/alternative trees. We special case these, flattening via a
+    // recursive call and then extracting only HTML parts, since their
+    // equivalent nature allows us to pick our desired representation and throw
+    // away the rest.
    while (unseen.length > 0) {
      const part = unseen.shift();
-      if (part instanceof Array) {
+      if (part instanceof Array && (part[0].type !== 'alternative')) {
        unseen.unshift(...part);
-      } else if (!ignoreTypes.has(part.type)) {
-        const mimeType = `${part.type}/${part.subtype}`;
-        available.push(mimeType);
-        const disposition = part.disposition ? part.disposition.type.toLowerCase() : null;
-        if (desiredTypes.has(mimeType) && (disposition !== 'attachment')) {
-          desired.push({
-            id: part.partID,
-            // encoding and charset may be null
-            transferEncoding: part.encoding,
-            charset: part.params ? part.params.charset : null,
-            mimeType,
-          });
+      } else if (part instanceof Array && (part[0].type === 'alternative')) {
+        // Picking our desired alternative part(s) here vastly simplifies
+        // later parsing of the body, since we can then completely ignore
+        // mime structure without making any terrible mistakes. We assume
+        // here that all multipart/alternative MIME parts are arrays of
+        // text/plain vs text/html, which is ~always true (and if it isn't,
+        // the message is bound to be absurd in other ways and we can't
+        // guarantee sensible display).
+        part.shift();
+        const htmlParts = this._getDesiredMIMEParts(part).filter((p) => {
+          return p.mimeType === 'text/html';
+        });
+        if (htmlParts.length > 0) {
+          desired.push(...htmlParts);
+        }
+      } else {
+        if (part.size) { // will skip all multipart types
+          const mimeType = `${part.type}/${part.subtype}`;
+          available.push(mimeType);
+          const disposition = part.disposition ? part.disposition.type.toLowerCase() : null;
+          if (desiredTypes.has(mimeType) && (disposition !== 'attachment')) {
+            desired.push({
+              id: part.partID,
+              // encoding and charset may be null
+              transferEncoding: part.encoding,
+              charset: part.params ? part.params.charset : null,
+              mimeType,
+            });
+          }
        }
      }
      // attachment metadata is extracted later---ignore for now
--- a/packages/local-sync/src/shared/message-factory.js
+++ b/packages/local-sync/src/shared/message-factory.js
@ -43,46 +43,42 @@ function extractContacts(input) {
 }


-function extractSnippet(plainBody, htmlBody) {
-  let snippetText = plainBody ? plainBody.trim() : '';
-  if (htmlBody) {
-    const doc = new DOMParser().parseFromString(htmlBody, 'text/html')
-    const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
-    const noSpaceTags = new Set(['B', 'I', 'STRONG', 'EM', 'SPAN']);
+function extractSnippet(body) {
+  const doc = new DOMParser().parseFromString(body, 'text/html')
+  const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
+  const noSpaceTags = new Set(['B', 'I', 'STRONG', 'EM', 'SPAN']);

-    const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
-      if (skipTags.has(node.tagName)) {
-        // skip this node and all its children
-        return NodeFilter.FILTER_REJECT;
+  const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
+    if (skipTags.has(node.tagName)) {
+      // skip this node and all its children
+      return NodeFilter.FILTER_REJECT;
+    }
+    if (node.nodeType === Node.TEXT_NODE) {
+      const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
+      if (nodeValue) {
+        return NodeFilter.FILTER_ACCEPT;
      }
-      if (node.nodeType === Node.TEXT_NODE) {
-        const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
-        if (nodeValue) {
-          return NodeFilter.FILTER_ACCEPT;
-        }
-        return NodeFilter.FILTER_SKIP;
-      }
-      return NodeFilter.FILTER_ACCEPT;
-    });
+      return NodeFilter.FILTER_SKIP;
+    }
+    return NodeFilter.FILTER_ACCEPT;
+  });

-    let extractedText = "";
-    let lastNodeTag = "";
-    while (treeWalker.nextNode()) {
-      if (treeWalker.currentNode.nodeType === Node.ELEMENT_NODE) {
-        lastNodeTag = treeWalker.currentNode.nodeName;
-      } else {
-        if (extractedText && !noSpaceTags.has(lastNodeTag)) {
-          extractedText += " ";
-        }
-        extractedText += treeWalker.currentNode.nodeValue;
-        if (extractedText.length > SNIPPET_MAX_SIZE) {
-          break;
-        }
+  let extractedText = "";
+  let lastNodeTag = "";
+  while (treeWalker.nextNode()) {
+    if (treeWalker.currentNode.nodeType === Node.ELEMENT_NODE) {
+      lastNodeTag = treeWalker.currentNode.nodeName;
+    } else {
+      if (extractedText && !noSpaceTags.has(lastNodeTag)) {
+        extractedText += " ";
+      }
+      extractedText += treeWalker.currentNode.nodeValue;
+      if (extractedText.length > SNIPPET_MAX_SIZE) {
+        break;
      }
    }
-
-    snippetText = extractedText.trim();
  }
+  const snippetText = extractedText.trim();

  // clean up and trim snippet
  let trimmed = snippetText.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
@ -164,33 +160,54 @@ function getReplyHeaders(messageReplyingTo) {
  return {inReplyTo, references}
 }

+/**
+ * Decode every desired MIME part of an IMAP message and concatenate the
+ * results into a single body string.
+ *
+ * @param {Object} imapMessage - message whose `parts` map is indexed by the
+ *   `id` of each desired part
+ * @param {Array} desiredParts - part descriptors carrying `id`, `mimeType`,
+ *   `transferEncoding` and `charset` (encoding and charset may be null)
+ * @returns {string} the concatenated body with NUL characters removed
+ * @throws {Error} when a part uses a Content-Transfer-Encoding other than
+ *   7bit, 8bit, binary, quoted-printable or base64
+ */
+function bodyFromParts(imapMessage, desiredParts) {
+  // Encodings for which NO transfer encoding has been performed; see
+  // https://www.w3.org/Protocols/rfc1341/5_Content-Transfer-Encoding.html
+  const identityEncodings = new Set(['7bit', '8bit', 'binary']);
+  let body = '';
+  for (const {id, mimeType, transferEncoding, charset} of desiredParts) {
+    const normalizedEncoding = transferEncoding ? transferEncoding.toLowerCase() : null;
+    let decoded = '';
+    if (!normalizedEncoding || identityEncodings.has(normalizedEncoding)) {
+      // How to decode to a string depends ONLY on the charset, which defaults
+      // to 'ascii' according to
+      // https://tools.ietf.org/html/rfc2045#section-5.2
+      decoded = encoding.convert(imapMessage.parts[id], 'utf-8', charset || 'ascii').toString('utf-8');
+    } else if (normalizedEncoding === 'quoted-printable') {
+      decoded = mimelib.decodeQuotedPrintable(imapMessage.parts[id], charset || 'ascii');
+    } else if (normalizedEncoding === 'base64') {
+      decoded = mimelib.decodeBase64(imapMessage.parts[id], charset || 'ascii');
+    } else {
+      // custom x-token content-transfer-encodings
+      //
+      // This function is synchronous and its result is used directly as the
+      // message body, so we must throw here: returning a rejected Promise
+      // would hand the caller a Promise object in place of a string and
+      // leave the rejection unhandled.
+      throw new Error(`Unsupported Content-Transfer-Encoding ${transferEncoding}, mimetype ${mimeType}`);
+    }
+    // desiredParts are in order of the MIME tree walk, e.g. 1.1, 1.2, 2...,
+    // and for multipart/alternative arrays, we have already pulled out the
+    // highest fidelity part (generally HTML).
+    //
+    // Therefore, the correct way to display multiple parts is to simply
+    // concatenate later ones with the body of the previous MIME parts.
+    //
+    // This may seem kind of weird, but some MUAs _do_ send out whack stuff
+    // like an HTML body followed by a plaintext footer.
+    if (mimeType === 'text/plain') {
+      body += htmlifyPlaintext(decoded);
+    } else {
+      body += decoded;
+    }
+  }
+  // sometimes decoding results in a NUL-terminated body string, which makes
+  // SQLite blow up with an 'unrecognized token' error
+  return body.replace(/\0/g, '');
+}

 // Since we only fetch the MIME structure and specific desired MIME parts from
 // IMAP, we unfortunately can't use an existing library like mailparser to parse
 // the message, and have to do fun stuff like deal with character sets and
 // content-transfer-encodings ourselves.
 async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder}) {
-  const {Message, Label} = db
-  const {attributes} = imapMessage
+  const {Message, Label} = db;
+  const {attributes} = imapMessage;

-  const body = {}
-  for (const {id, mimeType, transferEncoding, charset} of desiredParts) {
-    // see https://www.w3.org/Protocols/rfc1341/5_Content-Transfer-Encoding.html
-    if (!transferEncoding || new Set(['7bit', '8bit', 'binary']).has(transferEncoding.toLowerCase())) {
-      // NO transfer encoding has been performed --- how to decode to a string
-      // depends ONLY on the charset, which defaults to 'ascii' according to
-      // https://tools.ietf.org/html/rfc2045#section-5.2
-      const convertedBuffer = encoding.convert(imapMessage.parts[id], 'utf-8', charset || 'ascii')
-      body[mimeType] = convertedBuffer.toString('utf-8');
-    } else if (transferEncoding.toLowerCase() === 'quoted-printable') {
-      body[mimeType] = mimelib.decodeQuotedPrintable(imapMessage.parts[id], charset || 'ascii');
-    } else if (transferEncoding.toLowerCase() === 'base64') {
-      body[mimeType] = mimelib.decodeBase64(imapMessage.parts[id], charset || 'ascii');
-    } else {
-      // custom x-token content-transfer-encodings
-      return Promise.reject(new Error(`Unsupported Content-Transfer-Encoding ${transferEncoding}, mimetype ${mimeType}`))
-    }
-  }
  const headers = imapMessage.headers.toString('ascii');
  const parsedHeaders = mimelib.parseHeaders(headers);
  for (const key of ['x-gm-thrid', 'x-gm-msgid', 'x-gm-labels']) {
@ -204,7 +221,7 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder})
    from: extractContacts(parsedHeaders.from),
    replyTo: extractContacts(parsedHeaders['reply-to']),
    accountId: accountId,
-    body: body['text/html'] || body['text/plain'] || body['application/pgp-encrypted'] || '',
+    body: bodyFromParts(imapMessage, desiredParts),
    snippet: null,
    unread: !attributes.flags.includes('\\Seen'),
    starred: attributes.flags.includes('\\Flagged'),
@ -232,16 +249,8 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder})
  parsedMessage.id = Message.hash(parsedMessage)
  parsedMessage.date = new Date(Date.parse(parsedMessage.date))

-  // sometimes decoding results in a NUL-terminated body string, which makes
-  // SQLite blow up with an 'unrecognized token' error
-  parsedMessage.body = parsedMessage.body.replace(/\0/g, '');
-
-  if (!body['text/html'] && body['text/plain']) {
-    parsedMessage.body = htmlifyPlaintext(body['text/plain']);
-  }
-
-  parsedMessage.snippet = extractSnippet(body['text/plain'], body['text/html']);
-  parsedMessage.folder = folder
+  parsedMessage.snippet = extractSnippet(parsedMessage.body);
+  parsedMessage.folder = folder;

  // TODO: unclear if this is necessary given we already have parsed labels
  const xGmLabels = attributes['x-gm-labels']