diff --git a/packages/isomorphic-core/src/imap-box.js b/packages/isomorphic-core/src/imap-box.js index 01e17f4b7..10e2545a1 100644 --- a/packages/isomorphic-core/src/imap-box.js +++ b/packages/isomorphic-core/src/imap-box.js @@ -68,7 +68,7 @@ class IMAPBox { }); stream.once('end', () => { - const full = Buffer.concat(chunks).toString('utf8'); + const full = Buffer.concat(chunks); if (info.which === 'HEADER') { headers = full; } else { @@ -77,6 +77,10 @@ class IMAPBox { }); }); imapMessage.once('end', () => { + // attributes is an object containing ascii strings, but parts and + // headers are undecoded binary Buffers (since the data for mime + // parts cannot be decoded to strings without looking up charset data + // in metadata, and this function's job is only to fetch the raw data) forEachMessageCallback({attributes, headers, parts}); }); }) diff --git a/packages/local-sync/package.json b/packages/local-sync/package.json index 000ad00ca..a82f5968f 100644 --- a/packages/local-sync/package.json +++ b/packages/local-sync/package.json @@ -16,16 +16,14 @@ "joi": "8.4.2", "mimelib": "0.2.19", "nodemailer": "2.5.0", - "quoted-printable": "1.0.1", "request": "2.79.0", "rx": "4.1.0", "sequelize": "3.27.0", "sqlite3": "https://github.com/bengotow/node-sqlite3/archive/bengotow/usleep-v3.1.4.tar.gz", - "striptags": "2.1.1", "underscore": "1.8.3", - "utf7": "^1.0.2", "striptags": "2.1.1", - "vision": "4.1.0" + "vision": "4.1.0", + "encoding": "0.1.12" }, "scripts": { "test": "../../../../node_modules/.bin/electron ../../../../ --test --enable-logging --spec-directory=$(pwd)/spec" diff --git a/packages/local-sync/src/local-sync-worker/imap/fetch-messages-in-folder.js b/packages/local-sync/src/local-sync-worker/imap/fetch-messages-in-folder.js index ecdf17094..d219e284a 100644 --- a/packages/local-sync/src/local-sync-worker/imap/fetch-messages-in-folder.js +++ b/packages/local-sync/src/local-sync-worker/imap/fetch-messages-in-folder.js @@ -187,7 +187,9 @@ class FetchMessagesInFolder { if (['text/plain', 'text/html', 'application/pgp-encrypted'].includes(mimetype)) { desired.push({ id: part.partID, - encoding: part.encoding, + // encoding and charset may be null + transferEncoding: part.encoding, + charset: part.params ? part.params.charset : null, mimetype, }); } diff --git a/packages/local-sync/src/shared/message-factory.js b/packages/local-sync/src/shared/message-factory.js index 14ad4370b..48aa282fa 100644 --- a/packages/local-sync/src/shared/message-factory.js +++ b/packages/local-sync/src/shared/message-factory.js @@ -1,13 +1,15 @@ const _ = require('underscore'); const cryptography = require('crypto'); -const utf7 = require('utf7').imap; const mimelib = require('mimelib'); -const QuotedPrintable = require('quoted-printable'); const striptags = require('striptags'); +const encoding = require('encoding'); + const {Imap} = require('isomorphic-core'); const Errors = require('./errors'); -const SNIPPET_SIZE = 100 +// aiming for the former in length, but the latter is the hard db cutoff +const SNIPPET_SIZE = 100; +const SNIPPET_MAX_SIZE = 255; function extractContacts(values = []) { return values.map(v => { @@ -40,32 +42,41 @@ function setReplyHeaders(newMessage, prevMessage) { } } +/* +Since we only fetch the MIME structure and specific desired MIME parts from +IMAP, we unfortunately can't use an existing library like mailparser to parse +the message, and have to do fun stuff like deal with character sets and +content-transfer-encodings ourselves. +*/ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder}) { const {Label} = db + const {attributes} = imapMessage + const body = {} - const {headers, attributes} = imapMessage - const xGmLabels = attributes['x-gm-labels'] - for (const {id, mimetype, encoding} of desiredParts) { - if (!encoding) { - body[mimetype] = imapMessage.parts[id]; - } else if (encoding.toLowerCase() === 'quoted-printable') { - body[mimetype] = QuotedPrintable.decode(imapMessage.parts[id]); - } else if (encoding.toLowerCase() === '7bit') { - body[mimetype] = utf7.decode(imapMessage.parts[id]); - } else if (encoding.toLowerCase() === '8bit') { - body[mimetype] = Buffer.from(imapMessage.parts[id], 'utf8').toString(); - } else if (encoding && ['ascii', 'utf8', 'utf16le', 'ucs2', 'base64', 'latin1', 'binary', 'hex'].includes(encoding.toLowerCase())) { - body[mimetype] = Buffer.from(imapMessage.parts[id], encoding.toLowerCase()).toString(); + for (const {id, mimetype, transferEncoding, charset} of desiredParts) { + // see https://www.w3.org/Protocols/rfc1341/5_Content-Transfer-Encoding.html + if (!transferEncoding || new Set(['7bit', '8bit']).has(transferEncoding.toLowerCase())) { + // NO transfer encoding has been performed --- how to decode to a string + // depends ONLY on the charset, which defaults to 'ascii' according to + // https://tools.ietf.org/html/rfc2045#section-5.2 + const convertedBuffer = encoding.convert(imapMessage.parts[id], 'utf-8', charset || 'ascii') + body[mimetype] = convertedBuffer.toString('utf-8'); + } else if (transferEncoding.toLowerCase() === 'quoted-printable') { + body[mimetype] = mimelib.decodeQuotedPrintable(imapMessage.parts[id], charset || 'ascii'); + } else if (transferEncoding.toLowerCase() === 'base64') { + body[mimetype] = mimelib.decodeBase64(imapMessage.parts[id], charset || 'ascii'); } else { - return Promise.reject(new Error(`Unknown encoding ${encoding}, mimetype ${mimetype}`)) + // 'binary' and custom x-token content-transfer-encodings + return Promise.reject(new Error(`Unsupported Content-Transfer-Encoding ${transferEncoding}, mimetype ${mimetype}`)) } } + const headers = imapMessage.headers.toString('ascii'); const parsedHeaders = Imap.parseHeader(headers); for (const key of ['x-gm-thrid', 'x-gm-msgid', 'x-gm-labels']) { parsedHeaders[key] = attributes[key]; } - const values = { + const parsedMessage = { id: hashForHeaders(getHeadersForId(parsedHeaders)), to: extractContacts(parsedHeaders.to), cc: extractContacts(parsedHeaders.cc), @@ -74,7 +85,7 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder}) replyTo: extractContacts(parsedHeaders['reply-to']), accountId: accountId, body: body['text/html'] || body['text/plain'] || body['application/pgp-encrypted'] || '', - snippet: body['text/plain'] ? body['text/plain'].substr(0, 255) : null, + snippet: null, unread: !attributes.flags.includes('\\Seen'), starred: attributes.flags.includes('\\Flagged'), date: attributes.date, @@ -90,29 +101,41 @@ async function parseFromImap(imapMessage, desiredParts, {db, accountId, folder}) // preserve whitespacing on plaintext emails -- has the side effect of monospacing, but // that seems OK and perhaps sometimes even desired (for e.g. ascii art, alignment) if (!body['text/html'] && body['text/plain']) { - values.body = `
${values.body}
`; + parsedMessage.body = `
${parsedMessage.body}
`; } + // populate initial snippet + if (body['text/plain']) { + parsedMessage.snippet = body['text/plain'].trim().substr(0, SNIPPET_MAX_SIZE); + } else if (parsedMessage.body) { + // create snippet from body, which is most likely html. we strip tags but + // don't currently support stripping embedded CSS + parsedMessage.snippet = striptags(parsedMessage.body).trim().substr(0, + Math.min(parsedMessage.body.length, SNIPPET_MAX_SIZE)); + } + + // clean up and trim snippet + if (parsedMessage.snippet) { // TODO: strip quoted text from snippets also - if (values.snippet) { - // trim and clean snippet which is alreay present (from values plaintext) - values.snippet = values.snippet.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ') - const loc = values.snippet.indexOf(' ', SNIPPET_SIZE); - if (loc !== -1) { - values.snippet = values.snippet.substr(0, loc); + parsedMessage.snippet = parsedMessage.snippet.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ') + // trim down to approx. SNIPPET_SIZE w/out cutting off words right in the + // middle (if possible) + const wordBreak = parsedMessage.snippet.indexOf(' ', SNIPPET_SIZE); + if (wordBreak !== -1) { + parsedMessage.snippet = parsedMessage.snippet.substr(0, wordBreak); } - } else if (values.body) { - // create snippet from body, which is most likely html - values.snippet = striptags(values.body).trim().substr(0, Math.min(values.body.length, SNIPPET_SIZE)); } - values.folder = folder + parsedMessage.folder = folder + + // TODO: unclear if this is necessary given we already have parsed labels + const xGmLabels = attributes['x-gm-labels'] if (xGmLabels) { - values.folderImapXGMLabels = JSON.stringify(xGmLabels) - values.labels = await Label.findXGMLabels(xGmLabels) + parsedMessage.folderImapXGMLabels = JSON.stringify(xGmLabels) + parsedMessage.labels = await Label.findXGMLabels(xGmLabels) } - return values; + return parsedMessage; } function fromJSON(db, data) {