[local-sync] Correctly handle messages with non-alternative multipart text bodies

Summary:
It's possible to have multiple inline HTML parts in a message, or even
a multipart/alternative part that contains text and HTML, followed by a
plaintext signature. Previously, if there was more than one text part in
an email, we would pick the _last_ text/html or text/plain part that we
found, and treat that as the entire message body. This works most of the
time, but fails to display the full message body in some edge cases.
This patch fixes that by resolving multipart/alternative subparts to a
single part in the mimepart fetch stage, and then treating each desired
mime part separately when parsing the message, concatenating them if
there are multiple.

This makes K2's handling of multipart MIME message text better,
bug-wise, than the Python sync engine's, which has been mangling some
rare messages forever. (Example from my email: no email from the MIT
EECS Jobs List has ever displayed the mailing list signature in N1.)

Note that this patch also removes our tentative support for PGP
encrypted messages. I'd prefer to add that back in later, once I've dug
up some real example messages to test on, rather than leave it in its
current not-really-tested and probably not-really-working state, since
it makes it harder to make sure that the rest of the logic isn't broken.

Test Plan: manual for now - added examples of this to my growing list of regression tests to add to the message parser unit tests once I fix them

Reviewers: juan, evan

Reviewed By: evan

Differential Revision: https://phab.nylas.com/D3600
This commit is contained in:
Christine Spang 2017-01-05 17:18:18 -08:00
parent 78aa3291d6
commit 8238fe9594
2 changed files with 112 additions and 80 deletions

View file

@ -158,24 +158,47 @@ class FetchMessagesInFolder extends SyncOperation {
const desired = [];
const available = [];
const unseen = [struct];
const ignoreTypes = new Set(['alternative', 'mixed', 'signed']);
const desiredTypes = new Set(['text/plain', 'text/html', 'application/pgp-encrypted']);
const desiredTypes = new Set(['text/plain', 'text/html']);
// MIME structures can be REALLY FREAKING COMPLICATED. To simplify
// processing, we flatten the MIME structure by walking it depth-first,
// throwing away all multipart headers with the exception of
// multipart/alternative trees. We special case these, flattening via a
// recursive call and then extracting only HTML parts, since their
// equivalent nature allows us to pick our desired representation and throw
// away the rest.
while (unseen.length > 0) {
const part = unseen.shift();
if (part instanceof Array) {
if (part instanceof Array && (part[0].type !== 'alternative')) {
unseen.unshift(...part);
} else if (!ignoreTypes.has(part.type)) {
const mimeType = `${part.type}/${part.subtype}`;
available.push(mimeType);
const disposition = part.disposition ? part.disposition.type.toLowerCase() : null;
if (desiredTypes.has(mimeType) && (disposition !== 'attachment')) {
desired.push({
id: part.partID,
// encoding and charset may be null
transferEncoding: part.encoding,
charset: part.params ? part.params.charset : null,
mimeType,
});
} else if (part instanceof Array && (part[0].type === 'alternative')) {
// Picking our desired alternative part(s) here vastly simplifies
// later parsing of the body, since we can then completely ignore
// mime structure without making any terrible mistakes. We assume
// here that all multipart/alternative MIME parts are arrays of
// text/plain vs text/html, which is ~always true (and if it isn't,
// the message is bound to be absurd in other ways and we can't
// guarantee sensible display).
part.shift();
const htmlParts = this._getDesiredMIMEParts(part).filter((p) => {
return p.mimeType === 'text/html';
});
if (htmlParts.length > 0) {
desired.push(...htmlParts);
}
} else {
if (part.size) { // will skip all multipart types
const mimeType = `${part.type}/${part.subtype}`;
available.push(mimeType);
const disposition = part.disposition ? part.disposition.type.toLowerCase() : null;
if (desiredTypes.has(mimeType) && (disposition !== 'attachment')) {
desired.push({
id: part.partID,
// encoding and charset may be null
transferEncoding: part.encoding,
charset: part.params ? part.params.charset : null,
mimeType,
});
}
}
}
// attachment metadata is extracted later---ignore for now

View file

@ -43,46 +43,42 @@ function extractContacts(input) {
}
function extractSnippet(plainBody, htmlBody) {
let snippetText = plainBody ? plainBody.trim() : '';
if (htmlBody) {
const doc = new DOMParser().parseFromString(htmlBody, 'text/html')
const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
const noSpaceTags = new Set(['B', 'I', 'STRONG', 'EM', 'SPAN']);
function extractSnippet(body) {
const doc = new DOMParser().parseFromString(body, 'text/html')
const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
const noSpaceTags = new Set(['B', 'I', 'STRONG', 'EM', 'SPAN']);
const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
if (skipTags.has(node.tagName)) {
// skip this node and all its children
return NodeFilter.FILTER_REJECT;
const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
if (skipTags.has(node.tagName)) {
// skip this node and all its children
return NodeFilter.FILTER_REJECT;
}
if (node.nodeType === Node.TEXT_NODE) {
const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
if (nodeValue) {
return NodeFilter.FILTER_ACCEPT;
}
if (node.nodeType === Node.TEXT_NODE) {
const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
if (nodeValue) {
return NodeFilter.FILTER_ACCEPT;
}
return NodeFilter.FILTER_SKIP;
}
return NodeFilter.FILTER_ACCEPT;
});
return NodeFilter.FILTER_SKIP;
}
return NodeFilter.FILTER_ACCEPT;
});
let extractedText = "";
let lastNodeTag = "";
while (treeWalker.nextNode()) {
if (treeWalker.currentNode.nodeType === Node.ELEMENT_NODE) {
lastNodeTag = treeWalker.currentNode.nodeName;
} else {
if (extractedText && !noSpaceTags.has(lastNodeTag)) {
extractedText += " ";
}
extractedText += treeWalker.currentNode.nodeValue;
if (extractedText.length > SNIPPET_MAX_SIZE) {
break;
}
let extractedText = "";
let lastNodeTag = "";
while (treeWalker.nextNode()) {
if (treeWalker.currentNode.nodeType === Node.ELEMENT_NODE) {
lastNodeTag = treeWalker.currentNode.nodeName;
} else {
if (extractedText && !noSpaceTags.has(lastNodeTag)) {
extractedText += " ";
}
extractedText += treeWalker.currentNode.nodeValue;
if (extractedText.length > SNIPPET_MAX_SIZE) {
break;
}
}
snippetText = extractedText.trim();
}
const snippetText = extractedText.trim();
// clean up and trim snippet
let trimmed = snippetText.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
@ -164,33 +160,54 @@ function getReplyHeaders(messageReplyingTo) {
return {inReplyTo, references}
}
/**
 * Builds a message body by decoding each desired MIME part and concatenating
 * the results in the order the parts are given.
 *
 * text/plain parts are passed through htmlifyPlaintext() before being
 * appended; every other part is appended as decoded.
 *
 * @param {Object} imapMessage - fetched message; `imapMessage.parts[id]` is
 *   the still-encoded content handed to the decoders for the part `id`.
 * @param {Array<Object>} desiredParts - entries shaped like
 *   `{id, mimeType, transferEncoding, charset}`; `transferEncoding` and
 *   `charset` may be null.
 * @returns {string} the concatenated body with all NUL characters removed.
 * @throws {Error} if a part uses an unsupported Content-Transfer-Encoding.
 */
function bodyFromParts(imapMessage, desiredParts) {
  // Content-Transfer-Encodings that leave the payload untransformed; see
  // https://www.w3.org/Protocols/rfc1341/5_Content-Transfer-Encoding.html
  const identityEncodings = new Set(['7bit', '8bit', 'binary']);

  let body = '';
  for (const {id, mimeType, transferEncoding, charset} of desiredParts) {
    const normalizedEncoding = transferEncoding ? transferEncoding.toLowerCase() : null;
    let decoded = '';
    if (!normalizedEncoding || identityEncodings.has(normalizedEncoding)) {
      // NO transfer encoding has been performed --- how to decode to a string
      // depends ONLY on the charset, which defaults to 'ascii' according to
      // https://tools.ietf.org/html/rfc2045#section-5.2
      decoded = encoding.convert(imapMessage.parts[id], 'utf-8', charset || 'ascii').toString('utf-8');
    } else if (normalizedEncoding === 'quoted-printable') {
      decoded = mimelib.decodeQuotedPrintable(imapMessage.parts[id], charset || 'ascii');
    } else if (normalizedEncoding === 'base64') {
      decoded = mimelib.decodeBase64(imapMessage.parts[id], charset || 'ascii');
    } else {
      // custom x-token content-transfer-encodings
      //
      // This function is synchronous and its return value is used directly as
      // the body string, so we must throw here. Returning a rejected Promise
      // would silently hand the caller a Promise object instead of a body.
      // Thrown from inside an async caller, this still surfaces as a rejection.
      throw new Error(`Unsupported Content-Transfer-Encoding ${transferEncoding}, mimetype ${mimeType}`);
    }
    // desiredParts are in order of the MIME tree walk, e.g. 1.1, 1.2, 2...,
    // and for multipart/alternative arrays, we have already pulled out the
    // highest fidelity part (generally HTML).
    //
    // Therefore, the correct way to display multiple parts is to simply
    // concatenate later ones with the body of the previous MIME parts.
    //
    // This may seem kind of weird, but some MUAs _do_ send out whack stuff
    // like an HTML body followed by a plaintext footer.
    if (mimeType === 'text/plain') {
      body += htmlifyPlaintext(decoded);
    } else {
      body += decoded;
    }
  }
  // sometimes decoding results in a NUL-terminated body string, which makes
  // SQLite blow up with an 'unrecognized token' error
  body = body.replace(/\0/g, '');
  return body;
}
// Since we only fetch the MIME structure and specific desired MIME parts from
// IMAP, we unfortunately can't use an existing library like mailparser to parse
// the message, and have to do fun stuff like deal with character sets and
// content-transfer-encodings ourselves.
async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder}) {
const {Message, Label} = db
const {attributes} = imapMessage
const {Message, Label} = db;
const {attributes} = imapMessage;
const body = {}
for (const {id, mimeType, transferEncoding, charset} of desiredParts) {
// see https://www.w3.org/Protocols/rfc1341/5_Content-Transfer-Encoding.html
if (!transferEncoding || new Set(['7bit', '8bit', 'binary']).has(transferEncoding.toLowerCase())) {
// NO transfer encoding has been performed --- how to decode to a string
// depends ONLY on the charset, which defaults to 'ascii' according to
// https://tools.ietf.org/html/rfc2045#section-5.2
const convertedBuffer = encoding.convert(imapMessage.parts[id], 'utf-8', charset || 'ascii')
body[mimeType] = convertedBuffer.toString('utf-8');
} else if (transferEncoding.toLowerCase() === 'quoted-printable') {
body[mimeType] = mimelib.decodeQuotedPrintable(imapMessage.parts[id], charset || 'ascii');
} else if (transferEncoding.toLowerCase() === 'base64') {
body[mimeType] = mimelib.decodeBase64(imapMessage.parts[id], charset || 'ascii');
} else {
// custom x-token content-transfer-encodings
return Promise.reject(new Error(`Unsupported Content-Transfer-Encoding ${transferEncoding}, mimetype ${mimeType}`))
}
}
const headers = imapMessage.headers.toString('ascii');
const parsedHeaders = mimelib.parseHeaders(headers);
for (const key of ['x-gm-thrid', 'x-gm-msgid', 'x-gm-labels']) {
@ -204,7 +221,7 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder})
from: extractContacts(parsedHeaders.from),
replyTo: extractContacts(parsedHeaders['reply-to']),
accountId: accountId,
body: body['text/html'] || body['text/plain'] || body['application/pgp-encrypted'] || '',
body: bodyFromParts(imapMessage, desiredParts),
snippet: null,
unread: !attributes.flags.includes('\\Seen'),
starred: attributes.flags.includes('\\Flagged'),
@ -232,16 +249,8 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder})
parsedMessage.id = Message.hash(parsedMessage)
parsedMessage.date = new Date(Date.parse(parsedMessage.date))
// sometimes decoding results in a NUL-terminated body string, which makes
// SQLite blow up with an 'unrecognized token' error
parsedMessage.body = parsedMessage.body.replace(/\0/g, '');
if (!body['text/html'] && body['text/plain']) {
parsedMessage.body = htmlifyPlaintext(body['text/plain']);
}
parsedMessage.snippet = extractSnippet(body['text/plain'], body['text/html']);
parsedMessage.folder = folder
parsedMessage.snippet = extractSnippet(parsedMessage.body);
parsedMessage.folder = folder;
// TODO: unclear if this is necessary given we already have parsed labels
const xGmLabels = attributes['x-gm-labels']