From 4f75f7435f8773a7f1aef09d58a7452b8247116b Mon Sep 17 00:00:00 2001 From: Ben Gotow Date: Wed, 31 Jul 2019 03:50:56 -0500 Subject: [PATCH] Improve removal of quoted text added by terrible old mail clients --- app/src/services/quoted-html-transformer.ts | 209 ++++++++++++++------ 1 file changed, 150 insertions(+), 59 deletions(-) diff --git a/app/src/services/quoted-html-transformer.ts b/app/src/services/quoted-html-transformer.ts index 78756c49d..55235c0ec 100644 --- a/app/src/services/quoted-html-transformer.ts +++ b/app/src/services/quoted-html-transformer.ts @@ -2,13 +2,30 @@ import quoteStringDetector from './quote-string-detector'; import unwrappedSignatureDetector from './unwrapped-signature-detector'; const { FIRST_ORDERED_NODE_TYPE } = XPathResult; +const isEmptyishTextContent = el => { + // either '' or '---' (often left over from sig / confidentiality notice removal) + const trimmed = el.textContent.trim(); + return trimmed === '' || /^\-+$/.test(trimmed); +}; + +const looksLikeTrackingPixel = img => { + // we want to avoid hiding quoted text if the user has added an image beneath it, but only + // if that image is more than 1px in size... + const w = Number(img.getAttribute('width') || (img.style.width || '').replace('px', '') || 10000); + const h = Number( + img.getAttribute('height') || (img.style.height || '').replace('px', '') || 10000 + ); + return w <= 1 && h <= 1; +}; + class QuotedHTMLTransformer { annotationClass = 'mailspring-quoted-text-segment'; hasQuotedHTML(html) { const doc = this._parseHTML(html); - const quoteElements = this._findQuoteElements(doc); - return quoteElements.length > 0; + this._removeImagesStrippedByAnotherClient(doc); + this._removeTrailingFootersAndWhitespace(doc); + return this._findQuoteElements(doc).length > 0; } // Public: Removes quoted text from an HTML string @@ -27,15 +44,15 @@ class QuotedHTMLTransformer { // removeQuotedHTML(html, options = { keepIfWholeBodyIsQuote: true }) { const doc = this._parseHTML(html); - + this._removeImagesStrippedByAnotherClient(doc); + this._removeTrailingFootersAndWhitespace(doc); for (const el of this._findQuoteElements(doc)) { if (el) { el.remove(); } } - // It's possible that the entire body was quoted text anyway and we've - // removed everything. + // It's possible that the entire body was quoted text anyway and we've removed everything. if (options.keepIfWholeBodyIsQuote) { if (!doc.body || !doc.children[0] || doc.body.textContent.trim().length === 0) { return this._outputHTMLFor(this._parseHTML(html), { initialHTML: html }); @@ -52,7 +69,8 @@ class QuotedHTMLTransformer { } } - this._removeImagesStrippedByAnotherClient(doc); + // after removing all the quoted text, delete any whitespace that appeared between blocks + // so the email doesn't end with
x 50 this._removeUnnecessaryWhitespace(doc); return this._outputHTMLFor(doc, { initialHTML: html }); @@ -121,14 +139,14 @@ class QuotedHTMLTransformer { while (el.lastChild) { const child = el.lastChild; if (child.nodeType === Node.TEXT_NODE) { - if (child.textContent.trim() === '') { + if (isEmptyishTextContent(child)) { child.remove(); continue; } } if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) { removeTrailingWhitespaceChildren(child); - if (child.childElementCount === 0 && child.textContent.trim() === '') { + if (child.childElementCount === 0 && isEmptyishTextContent(child)) { child.remove(); continue; } @@ -185,12 +203,12 @@ class QuotedHTMLTransformer { this._findGmailQuotes, this._findBlockquoteQuotes, this._findQuotesAfterMessageHeaderBlock, - this._findConfidentialityNotice, + this._findQuotesAfter__OriginalMessage__, ]; let quoteElements = []; for (const parser of parsers) { - quoteElements = quoteElements.concat(parser(doc) || []); + quoteElements = quoteElements.concat(parser.call(this, doc) || []); } // Find top-level nodes that look like a signature - some clients append @@ -238,10 +256,10 @@ class QuotedHTMLTransformer { if (node.childNodes) { pile.push(...node.childNodes); } - if (node.nodeName === 'IMG') { + if (node.nodeName === 'IMG' && !looksLikeTrackingPixel(node)) { return true; } - if (node.nodeType === Node.TEXT_NODE && node.textContent.trim().length > 0) { + if (node.nodeType === Node.TEXT_NODE && !isEmptyishTextContent(node)) { return true; } } @@ -262,24 +280,87 @@ class QuotedHTMLTransformer { return Array.from(doc.querySelectorAll('blockquote')); } - _findConfidentialityNotice(doc) { + _removeTrailingFootersAndWhitespace(doc) { + let els = []; + let iters = 0; + while ((els = this._findTrailingFooter(doc))) { + iters++; + els.forEach(el => el.remove()); + this._removeUnnecessaryWhitespace(doc); + if (iters > 20) { + return; + } + } + } + + _findTrailingFooter(doc) { // Traverse from the body down the tree of "last" nodes looking for a - // Confidentiality Notice TEXT_NODE. We need to count this node as quoted - // text or it'll be handled as an inline reply and totally disable quoted - // text removal. + // Confidentiality Notice, "To unsubscribe from this group", etc. + // We strip these nodes because otherwise the quoted text logic + // thinks that they are inline replies to quoted text. + const footerRegexps = [ + /^Confidentiality Notice/i, + /strictly confidential/i, + /This email message is/i, + /You received this message because/i, + ]; + let head = doc.body; while (head) { const tc = head.textContent.trim(); - if (head.nodeType === Node.TEXT_NODE && tc.startsWith('Confidentiality Notice')) { + if (head.nodeType === Node.TEXT_NODE) { + if (footerRegexps.find(r => r.test(tc))) { + return [head]; + } + } + + // chop off Google groups unsubscribe instructions which are appended + // to the end but annoyingly not in a container. + if ( + tc === '.' && + head.previousSibling && + head.previousSibling.previousSibling && + head.previousSibling.previousSibling.textContent.trim().startsWith('To unsubscribe') + ) { + return [head, head.previousSibling, head.previousSibling.previousSibling]; + } + + // chop off gmail_signature if the user has it configured to go at the absolute + // bottom of the email + if (head.nodeName === 'DIV' && head.classList.contains('gmail_signature')) { return [head]; } + if (head.childNodes.length === 0 && tc === '') { head = head.previousSibling; } else { head = head.lastChild; } } - return []; + return null; + } + + _findQuotesAfter__OriginalMessage__(doc) { + // these are pulled from specific messages seen in the wild. I think that doing this + // via Xpath is still more performant than writing code to traverse + examine? + const originalMessageMarker = doc.evaluate( + `//div[. = '-------- Original message --------'] | + //div[. = '------ Original Message ------'] | + //div[starts-with(., '-----Original Message-----')] | + //i[. = '-------Original Message-------'] | + //div[. = '---Original---']`, + + doc.body, + null, + FIRST_ORDERED_NODE_TYPE, + null + ).singleNodeValue; + + if (!originalMessageMarker) { + return []; + } + + return this._collectAllNodesBelow(originalMessageMarker); } _findQuotesAfterMessageHeaderBlock(doc) { @@ -291,6 +372,8 @@ class QuotedHTMLTransformer { const dateXPath = ` //b[. = 'Sent:'] | //b[. = 'Date:'] | + //b[. = 'Sent: '] | + //b[. = 'Date: '] | //span[. = 'Sent: '] | //span[. = 'Date: '] | //span[. = 'Sent:'] | @@ -298,51 +381,59 @@ class QuotedHTMLTransformer { const dateMarker = doc.evaluate(dateXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null) .singleNodeValue; - if (dateMarker) { - // check to see if the parent container also contains the other two - const headerContainer = dateMarker.parentElement; - let matches = 0; - for (const node of Array.from(headerContainer.children)) { - const tc = (node as any).textContent.trim(); - if (tc === 'To:' || tc === 'Subject:') { - matches++; - } - } - if (matches === 2) { - // got a hit! let's cut some text. - const quotedTextNodes = []; + if (!dateMarker) { + return []; + } - // Special case to add "From:" because it's often detatched from the rest of the - // header fields. We just add it where ever it's located. - const fromXPath = "//b[. = 'From:'] | //span[. = 'From:']| //span[. = 'From: ']"; - let from = doc.evaluate(fromXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null) - .singleNodeValue; - - if (from) { - if (from.nodeName === 'SPAN') { - from = from.parentElement; - } - quotedTextNodes.push(from); - } - - // The headers container and everything past it in the document is quoted text. - // This traverses the DOM, walking up the tree and adding all siblings below - // our current path to the array. - let head = headerContainer; - while (head) { - quotedTextNodes.push(head); - while (head && !head.nextElementSibling) { - head = head.parentElement; - } - if (head) { - head = head.nextElementSibling; - } - } - return quotedTextNodes; + // check to see if the parent container also contains the other two + const headerContainer = dateMarker.parentElement; + let matches = 0; + for (const node of Array.from(headerContainer.children)) { + const tc = (node as any).textContent.trim(); + if (tc === 'To:' || tc === 'Subject:') { + matches++; } } - return []; + if (matches !== 2) { + return []; + } + + // got a hit! let's cut some text. + const quotedTextNodes = this._collectAllNodesBelow(headerContainer); + + // Special case to add "From:" because it's often detatched from the rest of the + // header fields. We just add it where ever it's located. + const fromXPath = "//b[. = 'From:'] | //span[. = 'From:']| //span[. = 'From: ']"; + let from = doc.evaluate(fromXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null) + .singleNodeValue; + + if (from) { + if (from.nodeName === 'SPAN') { + from = from.parentElement; + } + quotedTextNodes.push(from); + } + + return quotedTextNodes; } + + _collectAllNodesBelow = headerContainer => { + // The headers container and everything past it in the document is quoted text. + // This traverses the DOM, walking up the tree and adding all siblings below + // our current path to the array. + let head = headerContainer; + let results = []; + while (head) { + results.push(head); + while (head && !head.nextElementSibling) { + head = head.parentElement; + } + if (head) { + head = head.nextElementSibling; + } + } + return results; + }; } export default new QuotedHTMLTransformer();