fix(local-sync): Fix charset interpretation in message parsing

Summary:
This commit fixes the following bugs in message parsing:
- we were unconditionally decoding MIME bodies as UTF-8; instead, decode according
  to the charset data in the mimepart header
- '7bit' content-transfer-encoding means us-ascii, NOT utf-7
- only interpret valid content-transfer-encodings (previously we were trying
  to treat various charsets as transfer-encodings)
- clearer naming: s/values/parsedMessage/
- unify snippet cleanup between plaintext & stripped HTML (merging
  whitespace etc.)

Test Plan: unit tests coming

Reviewers: juan

Differential Revision: https://phab.nylas.com/D3491
This commit is contained in:
Christine Spang 2016-12-08 18:30:57 -08:00
parent 162dbbd141
commit 587f7787a6
4 changed files with 66 additions and 39 deletions

View file

@ -68,7 +68,7 @@ class IMAPBox {
});
stream.once('end', () => {
const full = Buffer.concat(chunks).toString('utf8');
const full = Buffer.concat(chunks);
if (info.which === 'HEADER') {
headers = full;
} else {
@ -77,6 +77,10 @@ class IMAPBox {
});
});
imapMessage.once('end', () => {
// attributes is an object containing ascii strings, but parts and
// headers are undecoded binary Buffers (since the data for mime
// parts cannot be decoded to strings without looking up charset data
// in metadata, and this function's job is only to fetch the raw data)
forEachMessageCallback({attributes, headers, parts});
});
})

View file

@ -16,16 +16,14 @@
"joi": "8.4.2",
"mimelib": "0.2.19",
"nodemailer": "2.5.0",
"quoted-printable": "1.0.1",
"request": "2.79.0",
"rx": "4.1.0",
"sequelize": "3.27.0",
"sqlite3": "https://github.com/bengotow/node-sqlite3/archive/bengotow/usleep-v3.1.4.tar.gz",
"striptags": "2.1.1",
"underscore": "1.8.3",
"utf7": "^1.0.2",
"striptags": "2.1.1",
"vision": "4.1.0"
"vision": "4.1.0",
"encoding": "0.1.12"
},
"scripts": {
"test": "../../../../node_modules/.bin/electron ../../../../ --test --enable-logging --spec-directory=$(pwd)/spec"

View file

@ -187,7 +187,9 @@ class FetchMessagesInFolder {
if (['text/plain', 'text/html', 'application/pgp-encrypted'].includes(mimetype)) {
desired.push({
id: part.partID,
encoding: part.encoding,
// encoding and charset may be null
transferEncoding: part.encoding,
charset: part.params ? part.params.charset : null,
mimetype,
});
}

View file

@ -1,13 +1,15 @@
const _ = require('underscore');
const cryptography = require('crypto');
const utf7 = require('utf7').imap;
const mimelib = require('mimelib');
const QuotedPrintable = require('quoted-printable');
const striptags = require('striptags');
const encoding = require('encoding');
const {Imap} = require('isomorphic-core');
const Errors = require('./errors');
const SNIPPET_SIZE = 100
// aiming for the former in length, but the latter is the hard db cutoff
const SNIPPET_SIZE = 100;
const SNIPPET_MAX_SIZE = 255;
function extractContacts(values = []) {
return values.map(v => {
@ -40,32 +42,41 @@ function setReplyHeaders(newMessage, prevMessage) {
}
}
/*
Since we only fetch the MIME structure and specific desired MIME parts from
IMAP, we unfortunately can't use an existing library like mailparser to parse
the message, and have to do fun stuff like deal with character sets and
content-transfer-encodings ourselves.
*/
async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder}) {
const {Label} = db
const {attributes} = imapMessage
const body = {}
const {headers, attributes} = imapMessage
const xGmLabels = attributes['x-gm-labels']
for (const {id, mimetype, encoding} of desiredParts) {
if (!encoding) {
body[mimetype] = imapMessage.parts[id];
} else if (encoding.toLowerCase() === 'quoted-printable') {
body[mimetype] = QuotedPrintable.decode(imapMessage.parts[id]);
} else if (encoding.toLowerCase() === '7bit') {
body[mimetype] = utf7.decode(imapMessage.parts[id]);
} else if (encoding.toLowerCase() === '8bit') {
body[mimetype] = Buffer.from(imapMessage.parts[id], 'utf8').toString();
} else if (encoding && ['ascii', 'utf8', 'utf16le', 'ucs2', 'base64', 'latin1', 'binary', 'hex'].includes(encoding.toLowerCase())) {
body[mimetype] = Buffer.from(imapMessage.parts[id], encoding.toLowerCase()).toString();
for (const {id, mimetype, transferEncoding, charset} of desiredParts) {
// see https://www.w3.org/Protocols/rfc1341/5_Content-Transfer-Encoding.html
if (!transferEncoding || new Set(['7bit', '8bit']).has(transferEncoding.toLowerCase())) {
// NO transfer encoding has been performed --- how to decode to a string
// depends ONLY on the charset, which defaults to 'ascii' according to
// https://tools.ietf.org/html/rfc2045#section-5.2
const convertedBuffer = encoding.convert(imapMessage.parts[id], 'utf-8', charset || 'ascii')
body[mimetype] = convertedBuffer.toString('utf-8');
} else if (transferEncoding.toLowerCase() === 'quoted-printable') {
body[mimetype] = mimelib.decodeQuotedPrintable(imapMessage.parts[id], charset || 'ascii');
} else if (transferEncoding.toLowerCase() === 'base64') {
body[mimetype] = mimelib.decodeBase64(imapMessage.parts[id], charset || 'ascii');
} else {
return Promise.reject(new Error(`Unknown encoding ${encoding}, mimetype ${mimetype}`))
// 'binary' and custom x-token content-transfer-encodings
return Promise.reject(new Error(`Unsupported Content-Transfer-Encoding ${transferEncoding}, mimetype ${mimetype}`))
}
}
const headers = imapMessage.headers.toString('ascii');
const parsedHeaders = Imap.parseHeader(headers);
for (const key of ['x-gm-thrid', 'x-gm-msgid', 'x-gm-labels']) {
parsedHeaders[key] = attributes[key];
}
const values = {
const parsedMessage = {
id: hashForHeaders(getHeadersForId(parsedHeaders)),
to: extractContacts(parsedHeaders.to),
cc: extractContacts(parsedHeaders.cc),
@ -74,7 +85,7 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder})
replyTo: extractContacts(parsedHeaders['reply-to']),
accountId: accountId,
body: body['text/html'] || body['text/plain'] || body['application/pgp-encrypted'] || '',
snippet: body['text/plain'] ? body['text/plain'].substr(0, 255) : null,
snippet: null,
unread: !attributes.flags.includes('\\Seen'),
starred: attributes.flags.includes('\\Flagged'),
date: attributes.date,
@ -90,29 +101,41 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder})
// preserve whitespacing on plaintext emails -- has the side effect of monospacing, but
// that seems OK and perhaps sometimes even desired (for e.g. ascii art, alignment)
if (!body['text/html'] && body['text/plain']) {
values.body = `<pre class="nylas-plaintext">${values.body}</pre>`;
parsedMessage.body = `<pre class="nylas-plaintext">${parsedMessage.body}</pre>`;
}
// populate initial snippet
if (body['text/plain']) {
parsedMessage.snippet = body['text/plain'].trim().substr(0, SNIPPET_MAX_SIZE);
} else if (parsedMessage.body) {
// create snippet from body, which is most likely html. we strip tags but
// don't currently support stripping embedded CSS
parsedMessage.snippet = striptags(parsedMessage.body).trim().substr(0,
Math.min(parsedMessage.body.length, SNIPPET_MAX_SIZE));
}
// clean up and trim snippet
if (parsedMessage.snippet) {
// TODO: strip quoted text from snippets also
if (values.snippet) {
// trim and clean snippet which is already present (from values plaintext)
values.snippet = values.snippet.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ')
const loc = values.snippet.indexOf(' ', SNIPPET_SIZE);
if (loc !== -1) {
values.snippet = values.snippet.substr(0, loc);
parsedMessage.snippet = parsedMessage.snippet.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ')
// trim down to approx. SNIPPET_SIZE w/out cutting off words right in the
// middle (if possible)
const wordBreak = parsedMessage.snippet.indexOf(' ', SNIPPET_SIZE);
if (wordBreak !== -1) {
parsedMessage.snippet = parsedMessage.snippet.substr(0, wordBreak);
}
} else if (values.body) {
// create snippet from body, which is most likely html
values.snippet = striptags(values.body).trim().substr(0, Math.min(values.body.length, SNIPPET_SIZE));
}
values.folder = folder
parsedMessage.folder = folder
// TODO: unclear if this is necessary given we already have parsed labels
const xGmLabels = attributes['x-gm-labels']
if (xGmLabels) {
values.folderImapXGMLabels = JSON.stringify(xGmLabels)
values.labels = await Label.findXGMLabels(xGmLabels)
parsedMessage.folderImapXGMLabels = JSON.stringify(xGmLabels)
parsedMessage.labels = await Label.findXGMLabels(xGmLabels)
}
return values;
return parsedMessage;
}
function fromJSON(db, data) {