Improve removal of quoted text added by terrible old mail clients

This commit is contained in:
Ben Gotow 2019-07-31 03:50:56 -05:00
parent 87a4853db9
commit 4f75f7435f

View file

@ -2,13 +2,30 @@ import quoteStringDetector from './quote-string-detector';
import unwrappedSignatureDetector from './unwrapped-signature-detector'; import unwrappedSignatureDetector from './unwrapped-signature-detector';
const { FIRST_ORDERED_NODE_TYPE } = XPathResult; const { FIRST_ORDERED_NODE_TYPE } = XPathResult;
const isEmptyishTextContent = el => {
  // A node counts as "emptyish" when, after trimming, its text is either
  // nothing at all or a run of dashes such as '---' (a common leftover once a
  // signature or confidentiality notice has been stripped).
  const text = el.textContent.trim();
  if (text.length === 0) {
    return true;
  }
  return /^\-+$/.test(text);
};
const looksLikeTrackingPixel = img => {
  // An image the user placed beneath quoted text should stop that text from
  // being hidden, unless the image is at most 1px by 1px.
  //
  // Each dimension is read from the HTML attribute first, then from the inline
  // style with 'px' stripped. A dimension found in neither place is treated as
  // 10000, so an image with an unknown size never counts as a pixel.
  const readDimension = name => {
    const raw = img.getAttribute(name) || (img.style[name] || '').replace('px', '') || 10000;
    return Number(raw);
  };
  const dimensions = ['width', 'height'].map(name => readDimension(name));
  return dimensions.every(value => value <= 1);
};
class QuotedHTMLTransformer { class QuotedHTMLTransformer {
annotationClass = 'mailspring-quoted-text-segment'; annotationClass = 'mailspring-quoted-text-segment';
// Reports whether the given HTML string contains quoted text: the result is
// true when `_findQuoteElements` returns at least one element for the parsed
// document.
// NOTE(review): this is a side-by-side diff rendering; on fused lines the
// first copy appears to be the old column and the second copy the new one.
hasQuotedHTML(html) { hasQuotedHTML(html) {
const doc = this._parseHTML(html); const doc = this._parseHTML(html);
// New column: the parsed document is first passed through
// `_removeImagesStrippedByAnotherClient` and `_removeTrailingFootersAndWhitespace`
// before quote detection runs.
const quoteElements = this._findQuoteElements(doc); this._removeImagesStrippedByAnotherClient(doc);
return quoteElements.length > 0; this._removeTrailingFootersAndWhitespace(doc);
return this._findQuoteElements(doc).length > 0;
} }
// Public: Removes quoted text from an HTML string // Public: Removes quoted text from an HTML string
@ -27,15 +44,15 @@ class QuotedHTMLTransformer {
// //
removeQuotedHTML(html, options = { keepIfWholeBodyIsQuote: true }) { removeQuotedHTML(html, options = { keepIfWholeBodyIsQuote: true }) {
const doc = this._parseHTML(html); const doc = this._parseHTML(html);
this._removeImagesStrippedByAnotherClient(doc);
this._removeTrailingFootersAndWhitespace(doc);
for (const el of this._findQuoteElements(doc)) { for (const el of this._findQuoteElements(doc)) {
if (el) { if (el) {
el.remove(); el.remove();
} }
} }
// It's possible that the entire body was quoted text anyway and we've // It's possible that the entire body was quoted text anyway and we've removed everything.
// removed everything.
if (options.keepIfWholeBodyIsQuote) { if (options.keepIfWholeBodyIsQuote) {
if (!doc.body || !doc.children[0] || doc.body.textContent.trim().length === 0) { if (!doc.body || !doc.children[0] || doc.body.textContent.trim().length === 0) {
return this._outputHTMLFor(this._parseHTML(html), { initialHTML: html }); return this._outputHTMLFor(this._parseHTML(html), { initialHTML: html });
@ -52,7 +69,8 @@ class QuotedHTMLTransformer {
} }
} }
this._removeImagesStrippedByAnotherClient(doc); // after removing all the quoted text, delete any whitespace that appeared between blocks
// so the email doesn't end with <br> x 50
this._removeUnnecessaryWhitespace(doc); this._removeUnnecessaryWhitespace(doc);
return this._outputHTMLFor(doc, { initialHTML: html }); return this._outputHTMLFor(doc, { initialHTML: html });
@ -121,14 +139,14 @@ class QuotedHTMLTransformer {
while (el.lastChild) { while (el.lastChild) {
const child = el.lastChild; const child = el.lastChild;
if (child.nodeType === Node.TEXT_NODE) { if (child.nodeType === Node.TEXT_NODE) {
if (child.textContent.trim() === '') { if (isEmptyishTextContent(child)) {
child.remove(); child.remove();
continue; continue;
} }
} }
if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) { if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) {
removeTrailingWhitespaceChildren(child); removeTrailingWhitespaceChildren(child);
if (child.childElementCount === 0 && child.textContent.trim() === '') { if (child.childElementCount === 0 && isEmptyishTextContent(child)) {
child.remove(); child.remove();
continue; continue;
} }
@ -185,12 +203,12 @@ class QuotedHTMLTransformer {
this._findGmailQuotes, this._findGmailQuotes,
this._findBlockquoteQuotes, this._findBlockquoteQuotes,
this._findQuotesAfterMessageHeaderBlock, this._findQuotesAfterMessageHeaderBlock,
this._findConfidentialityNotice, this._findQuotesAfter__OriginalMessage__,
]; ];
let quoteElements = []; let quoteElements = [];
for (const parser of parsers) { for (const parser of parsers) {
quoteElements = quoteElements.concat(parser(doc) || []); quoteElements = quoteElements.concat(parser.call(this, doc) || []);
} }
// Find top-level nodes that look like a signature - some clients append // Find top-level nodes that look like a signature - some clients append
@ -238,10 +256,10 @@ class QuotedHTMLTransformer {
if (node.childNodes) { if (node.childNodes) {
pile.push(...node.childNodes); pile.push(...node.childNodes);
} }
if (node.nodeName === 'IMG') { if (node.nodeName === 'IMG' && !looksLikeTrackingPixel(node)) {
return true; return true;
} }
if (node.nodeType === Node.TEXT_NODE && node.textContent.trim().length > 0) { if (node.nodeType === Node.TEXT_NODE && !isEmptyishTextContent(node)) {
return true; return true;
} }
} }
@ -262,24 +280,87 @@ class QuotedHTMLTransformer {
return Array.from(doc.querySelectorAll('blockquote')); return Array.from(doc.querySelectorAll('blockquote'));
} }
// _removeTrailingFootersAndWhitespace: repeatedly strips trailing footers from
// `doc`. Each pass removes the nodes returned by `_findTrailingFooter` and then
// calls `_removeUnnecessaryWhitespace`; the loop ends when `_findTrailingFooter`
// returns a falsy value.
// NOTE(review): the line below fuses the old-column header
// (`_findConfidentialityNotice`) with the new-column header.
_findConfidentialityNotice(doc) { _removeTrailingFootersAndWhitespace(doc) {
let els = [];
let iters = 0;
while ((els = this._findTrailingFooter(doc))) {
iters++;
els.forEach(el => el.remove());
this._removeUnnecessaryWhitespace(doc);
// Safety valve: stop once more than 20 passes have run, so a footer that
// keeps being found cannot loop forever.
if (iters > 20) {
return;
}
}
}
// Walks from `doc.body` along the chain of last children looking for a
// trailing footer. Returns an array of the nodes that make up the first footer
// found (a matching text node, the three Google-groups unsubscribe nodes, or a
// trailing `div.gmail_signature`).
// NOTE(review): the final return line shows `[]` in the old column and `null`
// in the new column; the `while ((els = ...))` caller relies on a falsy value
// to stop.
_findTrailingFooter(doc) {
// Traverse from the body down the tree of "last" nodes looking for a // Traverse from the body down the tree of "last" nodes looking for a
// Confidentiality Notice TEXT_NODE. We need to count this node as quoted // Confidentiality Notice, "To unsubscribe from this group", etc.
// text or it'll be handled as an inline reply and totally disable quoted // We strip these nodes because otherwise the quoted text logic
// text removal. // thinks that they are inline replies to quoted text.
// Patterns tested against the trimmed text of a text node; all are
// case-insensitive and only the first is anchored to the start of the text.
const footerRegexps = [
/^Confidentiality Notice/i,
/strictly confidential/i,
/This email message is/i,
/You received this message because/i,
];
let head = doc.body; let head = doc.body;
while (head) { while (head) {
const tc = head.textContent.trim(); const tc = head.textContent.trim();
if (head.nodeType === Node.TEXT_NODE && tc.startsWith('Confidentiality Notice')) { if (head.nodeType === Node.TEXT_NODE) {
if (footerRegexps.find(r => r.test(tc))) {
return [head];
}
}
// chop off Google groups unsubscribe instructions which are appended
// to the end but annoyingly not in a container.
// The match is a node whose text is just '.', whose second-previous sibling's
// text starts with 'To unsubscribe'; all three nodes are returned.
if (
tc === '.' &&
head.previousSibling &&
head.previousSibling.previousSibling &&
head.previousSibling.previousSibling.textContent.trim().startsWith('To unsubscribe')
) {
return [head, head.previousSibling, head.previousSibling.previousSibling];
}
// chop off gmail_signature if the user has it configured to go at the absolute
// bottom of the email
if (head.nodeName === 'DIV' && head.classList.contains('gmail_signature')) {
return [head]; return [head];
} }
// An empty, childless node is skipped by stepping to its previous sibling;
// anything else is descended into via its last child. A childless node that
// still has text ends the walk, because its `lastChild` is null.
if (head.childNodes.length === 0 && tc === '') { if (head.childNodes.length === 0 && tc === '') {
head = head.previousSibling; head = head.previousSibling;
} else { } else {
head = head.lastChild; head = head.lastChild;
} }
} }
return []; return null;
}
// Looks for an "Original Message" separator element and treats it, plus
// everything `_collectAllNodesBelow` gathers after it, as quoted text.
// Returns [] when none of the marker strings below is found.
_findQuotesAfter__OriginalMessage__(doc) {
// these are pulled from specific messages seen in the wild. I think that doing this
// via Xpath is still more performant than writing code to traverse + examine?
// FIRST_ORDERED_NODE_TYPE asks for a single result, so only the first matching
// marker in document order is used.
const originalMessageMarker = doc.evaluate(
`//div[. = '-------- Original message --------'] |
//div[. = '------ Original Message ------'] |
//div[starts-with(., '-----Original Message-----')] |
//i[. = '-------Original Message-------'] |
//div[. = '---Original---']`,
doc.body,
null,
FIRST_ORDERED_NODE_TYPE,
null
).singleNodeValue;
if (!originalMessageMarker) {
return [];
}
return this._collectAllNodesBelow(originalMessageMarker);
} }
_findQuotesAfterMessageHeaderBlock(doc) { _findQuotesAfterMessageHeaderBlock(doc) {
@ -291,6 +372,8 @@ class QuotedHTMLTransformer {
const dateXPath = ` const dateXPath = `
//b[. = 'Sent:'] | //b[. = 'Sent:'] |
//b[. = 'Date:'] | //b[. = 'Date:'] |
//b[. = 'Sent: '] |
//b[. = 'Date: '] |
//span[. = 'Sent: '] | //span[. = 'Sent: '] |
//span[. = 'Date: '] | //span[. = 'Date: '] |
//span[. = 'Sent:'] | //span[. = 'Sent:'] |
@ -298,51 +381,59 @@ class QuotedHTMLTransformer {
const dateMarker = doc.evaluate(dateXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null) const dateMarker = doc.evaluate(dateXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null)
.singleNodeValue; .singleNodeValue;
if (dateMarker) { if (!dateMarker) {
// check to see if the parent container also contains the other two return [];
const headerContainer = dateMarker.parentElement; }
let matches = 0;
for (const node of Array.from(headerContainer.children)) {
const tc = (node as any).textContent.trim();
if (tc === 'To:' || tc === 'Subject:') {
matches++;
}
}
if (matches === 2) {
// got a hit! let's cut some text.
const quotedTextNodes = [];
// Special case to add "From:" because it's often detatched from the rest of the // check to see if the parent container also contains the other two
// header fields. We just add it where ever it's located. const headerContainer = dateMarker.parentElement;
const fromXPath = "//b[. = 'From:'] | //span[. = 'From:']| //span[. = 'From: ']"; let matches = 0;
let from = doc.evaluate(fromXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null) for (const node of Array.from(headerContainer.children)) {
.singleNodeValue; const tc = (node as any).textContent.trim();
if (tc === 'To:' || tc === 'Subject:') {
if (from) { matches++;
if (from.nodeName === 'SPAN') {
from = from.parentElement;
}
quotedTextNodes.push(from);
}
// The headers container and everything past it in the document is quoted text.
// This traverses the DOM, walking up the tree and adding all siblings below
// our current path to the array.
let head = headerContainer;
while (head) {
quotedTextNodes.push(head);
while (head && !head.nextElementSibling) {
head = head.parentElement;
}
if (head) {
head = head.nextElementSibling;
}
}
return quotedTextNodes;
} }
} }
return []; if (matches !== 2) {
return [];
}
// got a hit! let's cut some text.
const quotedTextNodes = this._collectAllNodesBelow(headerContainer);
// Special case to add "From:" because it's often detached from the rest of the
// header fields. We just add it wherever it's located.
const fromXPath = "//b[. = 'From:'] | //span[. = 'From:']| //span[. = 'From: ']";
let from = doc.evaluate(fromXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null)
.singleNodeValue;
if (from) {
if (from.nodeName === 'SPAN') {
from = from.parentElement;
}
quotedTextNodes.push(from);
}
return quotedTextNodes;
} }
_collectAllNodesBelow = headerContainer => {
  // Everything from the given container to the end of the document is quoted
  // text. Starting at the container, collect it and then every element reached
  // by repeatedly moving to the next element sibling, climbing to the parent
  // whenever the current level has no further siblings.
  const nextBelow = node => {
    let cursor = node;
    while (cursor && !cursor.nextElementSibling) {
      cursor = cursor.parentElement;
    }
    return cursor ? cursor.nextElementSibling : cursor;
  };

  const collected = [];
  for (let current = headerContainer; current; current = nextBelow(current)) {
    collected.push(current);
  }
  return collected;
};
} }
export default new QuotedHTMLTransformer(); export default new QuotedHTMLTransformer();