mirror of
https://github.com/Foundry376/Mailspring.git
synced 2025-10-03 01:44:42 +08:00
Improve removal of quoted text added by terrible old mail clients
This commit is contained in:
parent
87a4853db9
commit
4f75f7435f
1 changed file with 150 additions and 59 deletions
|
@ -2,13 +2,30 @@ import quoteStringDetector from './quote-string-detector';
|
||||||
import unwrappedSignatureDetector from './unwrapped-signature-detector';
|
import unwrappedSignatureDetector from './unwrapped-signature-detector';
|
||||||
const { FIRST_ORDERED_NODE_TYPE } = XPathResult;
|
const { FIRST_ORDERED_NODE_TYPE } = XPathResult;
|
||||||
|
|
||||||
|
/**
 * Reports whether a node carries no meaningful text: after trimming, its text
 * is either empty or made up solely of hyphens (e.g. '---', which is often
 * left over after a signature / confidentiality notice has been removed).
 *
 * @param {{textContent: (string|null)}} el - DOM node (or node-like object) to inspect.
 * @returns {boolean} true when the trimmed text is '' or only '-' characters.
 */
const isEmptyishTextContent = el => {
  // textContent is null for some node types (documents, doctypes); treat that as empty
  // instead of throwing on `.trim()`.
  const trimmed = (el.textContent || '').trim();
  return trimmed === '' || /^-+$/.test(trimmed);
};
|
||||||
|
|
||||||
|
/**
 * Reports whether an image is at most 1x1 in size, i.e. looks like a tracking pixel.
 *
 * We want to avoid hiding quoted text if the user has added an image beneath it, but
 * only if that image is more than 1px in size.
 *
 * Each dimension is read from the `width` / `height` attribute first and falls back to
 * the inline style. A dimension that is not specified anywhere is treated as very large,
 * so an image only counts as a pixel when BOTH dimensions are known to be <= 1.
 *
 * @param {Element} img - the <img> element to inspect; must expose `getAttribute` and `style`.
 * @returns {boolean} true when both width and height resolve to <= 1.
 */
const looksLikeTrackingPixel = img => {
  // stand-in for "no size given": large enough to never pass the <= 1 check
  const UNKNOWN_SIZE = 10000;

  const readDimension = name => {
    const raw = img.getAttribute(name) || img.style[name] || '';
    // Mail generators frequently emit `width="1px"` (and browsers accept it), and inline
    // styles may use any letter case for the unit, so drop a trailing "px" from either
    // source before converting. Anything else non-numeric (e.g. "100%") becomes NaN,
    // which fails the <= 1 comparison and so is not treated as a pixel.
    const numeric = String(raw).replace(/px\s*$/i, '');
    return Number(numeric || UNKNOWN_SIZE);
  };

  return readDimension('width') <= 1 && readDimension('height') <= 1;
};
|
||||||
|
|
||||||
class QuotedHTMLTransformer {
|
class QuotedHTMLTransformer {
|
||||||
annotationClass = 'mailspring-quoted-text-segment';
|
annotationClass = 'mailspring-quoted-text-segment';
|
||||||
|
|
||||||
hasQuotedHTML(html) {
|
hasQuotedHTML(html) {
|
||||||
const doc = this._parseHTML(html);
|
const doc = this._parseHTML(html);
|
||||||
const quoteElements = this._findQuoteElements(doc);
|
this._removeImagesStrippedByAnotherClient(doc);
|
||||||
return quoteElements.length > 0;
|
this._removeTrailingFootersAndWhitespace(doc);
|
||||||
|
return this._findQuoteElements(doc).length > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Public: Removes quoted text from an HTML string
|
// Public: Removes quoted text from an HTML string
|
||||||
|
@ -27,15 +44,15 @@ class QuotedHTMLTransformer {
|
||||||
//
|
//
|
||||||
removeQuotedHTML(html, options = { keepIfWholeBodyIsQuote: true }) {
|
removeQuotedHTML(html, options = { keepIfWholeBodyIsQuote: true }) {
|
||||||
const doc = this._parseHTML(html);
|
const doc = this._parseHTML(html);
|
||||||
|
this._removeImagesStrippedByAnotherClient(doc);
|
||||||
|
this._removeTrailingFootersAndWhitespace(doc);
|
||||||
for (const el of this._findQuoteElements(doc)) {
|
for (const el of this._findQuoteElements(doc)) {
|
||||||
if (el) {
|
if (el) {
|
||||||
el.remove();
|
el.remove();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// It's possible that the entire body was quoted text anyway and we've
|
// It's possible that the entire body was quoted text anyway and we've removed everything.
|
||||||
// removed everything.
|
|
||||||
if (options.keepIfWholeBodyIsQuote) {
|
if (options.keepIfWholeBodyIsQuote) {
|
||||||
if (!doc.body || !doc.children[0] || doc.body.textContent.trim().length === 0) {
|
if (!doc.body || !doc.children[0] || doc.body.textContent.trim().length === 0) {
|
||||||
return this._outputHTMLFor(this._parseHTML(html), { initialHTML: html });
|
return this._outputHTMLFor(this._parseHTML(html), { initialHTML: html });
|
||||||
|
@ -52,7 +69,8 @@ class QuotedHTMLTransformer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
this._removeImagesStrippedByAnotherClient(doc);
|
// after removing all the quoted text, delete any whitespace that appeared between blocks
|
||||||
|
// so the email doesn't end with <br> x 50
|
||||||
this._removeUnnecessaryWhitespace(doc);
|
this._removeUnnecessaryWhitespace(doc);
|
||||||
|
|
||||||
return this._outputHTMLFor(doc, { initialHTML: html });
|
return this._outputHTMLFor(doc, { initialHTML: html });
|
||||||
|
@ -121,14 +139,14 @@ class QuotedHTMLTransformer {
|
||||||
while (el.lastChild) {
|
while (el.lastChild) {
|
||||||
const child = el.lastChild;
|
const child = el.lastChild;
|
||||||
if (child.nodeType === Node.TEXT_NODE) {
|
if (child.nodeType === Node.TEXT_NODE) {
|
||||||
if (child.textContent.trim() === '') {
|
if (isEmptyishTextContent(child)) {
|
||||||
child.remove();
|
child.remove();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) {
|
if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) {
|
||||||
removeTrailingWhitespaceChildren(child);
|
removeTrailingWhitespaceChildren(child);
|
||||||
if (child.childElementCount === 0 && child.textContent.trim() === '') {
|
if (child.childElementCount === 0 && isEmptyishTextContent(child)) {
|
||||||
child.remove();
|
child.remove();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -185,12 +203,12 @@ class QuotedHTMLTransformer {
|
||||||
this._findGmailQuotes,
|
this._findGmailQuotes,
|
||||||
this._findBlockquoteQuotes,
|
this._findBlockquoteQuotes,
|
||||||
this._findQuotesAfterMessageHeaderBlock,
|
this._findQuotesAfterMessageHeaderBlock,
|
||||||
this._findConfidentialityNotice,
|
this._findQuotesAfter__OriginalMessage__,
|
||||||
];
|
];
|
||||||
|
|
||||||
let quoteElements = [];
|
let quoteElements = [];
|
||||||
for (const parser of parsers) {
|
for (const parser of parsers) {
|
||||||
quoteElements = quoteElements.concat(parser(doc) || []);
|
quoteElements = quoteElements.concat(parser.call(this, doc) || []);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Find top-level nodes that look like a signature - some clients append
|
// Find top-level nodes that look like a signature - some clients append
|
||||||
|
@ -238,10 +256,10 @@ class QuotedHTMLTransformer {
|
||||||
if (node.childNodes) {
|
if (node.childNodes) {
|
||||||
pile.push(...node.childNodes);
|
pile.push(...node.childNodes);
|
||||||
}
|
}
|
||||||
if (node.nodeName === 'IMG') {
|
if (node.nodeName === 'IMG' && !looksLikeTrackingPixel(node)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
if (node.nodeType === Node.TEXT_NODE && node.textContent.trim().length > 0) {
|
if (node.nodeType === Node.TEXT_NODE && !isEmptyishTextContent(node)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -262,26 +280,89 @@ class QuotedHTMLTransformer {
|
||||||
return Array.from(doc.querySelectorAll('blockquote'));
|
return Array.from(doc.querySelectorAll('blockquote'));
|
||||||
}
|
}
|
||||||
|
|
||||||
_findConfidentialityNotice(doc) {
|
_removeTrailingFootersAndWhitespace(doc) {
|
||||||
|
let els = [];
|
||||||
|
let iters = 0;
|
||||||
|
while ((els = this._findTrailingFooter(doc))) {
|
||||||
|
iters++;
|
||||||
|
els.forEach(el => el.remove());
|
||||||
|
this._removeUnnecessaryWhitespace(doc);
|
||||||
|
if (iters > 20) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
_findTrailingFooter(doc) {
|
||||||
// Traverse from the body down the tree of "last" nodes looking for a
|
// Traverse from the body down the tree of "last" nodes looking for a
|
||||||
// Confidentiality Notice TEXT_NODE. We need to count this node as quoted
|
// Confidentiality Notice, "To unsubscribe from this group", etc.
|
||||||
// text or it'll be handled as an inline reply and totally disable quoted
|
// We strip these nodes because otherwise the quoted text logic
|
||||||
// text removal.
|
// thinks that they are inline replies to quoted text.
|
||||||
|
const footerRegexps = [
|
||||||
|
/^Confidentiality Notice/i,
|
||||||
|
/strictly confidential/i,
|
||||||
|
/This email message is/i,
|
||||||
|
/You received this message because/i,
|
||||||
|
];
|
||||||
|
|
||||||
let head = doc.body;
|
let head = doc.body;
|
||||||
while (head) {
|
while (head) {
|
||||||
const tc = head.textContent.trim();
|
const tc = head.textContent.trim();
|
||||||
if (head.nodeType === Node.TEXT_NODE && tc.startsWith('Confidentiality Notice')) {
|
if (head.nodeType === Node.TEXT_NODE) {
|
||||||
|
if (footerRegexps.find(r => r.test(tc))) {
|
||||||
return [head];
|
return [head];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// chop off Google groups unsubscribe instructions which are appended
|
||||||
|
// to the end but annoyingly not in a container.
|
||||||
|
if (
|
||||||
|
tc === '.' &&
|
||||||
|
head.previousSibling &&
|
||||||
|
head.previousSibling.previousSibling &&
|
||||||
|
head.previousSibling.previousSibling.textContent.trim().startsWith('To unsubscribe')
|
||||||
|
) {
|
||||||
|
return [head, head.previousSibling, head.previousSibling.previousSibling];
|
||||||
|
}
|
||||||
|
|
||||||
|
// chop off gmail_signature if the user has it configured to go at the absolute
|
||||||
|
// bottom of the email
|
||||||
|
if (head.nodeName === 'DIV' && head.classList.contains('gmail_signature')) {
|
||||||
|
return [head];
|
||||||
|
}
|
||||||
|
|
||||||
if (head.childNodes.length === 0 && tc === '') {
|
if (head.childNodes.length === 0 && tc === '') {
|
||||||
head = head.previousSibling;
|
head = head.previousSibling;
|
||||||
} else {
|
} else {
|
||||||
head = head.lastChild;
|
head = head.lastChild;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
_findQuotesAfter__OriginalMessage__(doc) {
|
||||||
|
// these are pulled from specific messages seen in the wild. I think that doing this
|
||||||
|
// via Xpath is still more performant than writing code to traverse + examine?
|
||||||
|
const originalMessageMarker = doc.evaluate(
|
||||||
|
`//div[. = '-------- Original message --------'] |
|
||||||
|
//div[. = '------ Original Message ------'] |
|
||||||
|
//div[starts-with(., '-----Original Message-----')] |
|
||||||
|
//i[. = '-------Original Message-------'] |
|
||||||
|
//div[. = '---Original---']`,
|
||||||
|
|
||||||
|
doc.body,
|
||||||
|
null,
|
||||||
|
FIRST_ORDERED_NODE_TYPE,
|
||||||
|
null
|
||||||
|
).singleNodeValue;
|
||||||
|
|
||||||
|
if (!originalMessageMarker) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return this._collectAllNodesBelow(originalMessageMarker);
|
||||||
|
}
|
||||||
|
|
||||||
_findQuotesAfterMessageHeaderBlock(doc) {
|
_findQuotesAfterMessageHeaderBlock(doc) {
|
||||||
// This detector looks for an element in the DOM tree containing
|
// This detector looks for an element in the DOM tree containing
|
||||||
// three children: <b>Sent:</b> or <b>Date:</b> and <b>To:</b> and
|
// three children: <b>Sent:</b> or <b>Date:</b> and <b>To:</b> and
|
||||||
|
@ -289,6 +370,8 @@ class QuotedHTMLTransformer {
|
||||||
|
|
||||||
// Find a DOM node exactly matching <b>Sent:</b>
|
// Find a DOM node exactly matching <b>Sent:</b>
|
||||||
const dateXPath = `
|
const dateXPath = `
|
||||||
|
//b[. = 'Sent:'] |
|
||||||
|
//b[. = 'Date:'] |
|
||||||
//b[. = 'Sent: '] |
|
//b[. = 'Sent: '] |
|
||||||
//b[. = 'Date: '] |
|
//b[. = 'Date: '] |
|
||||||
//span[. = 'Sent: '] |
|
//span[. = 'Sent: '] |
|
||||||
|
@ -298,7 +381,10 @@ class QuotedHTMLTransformer {
|
||||||
const dateMarker = doc.evaluate(dateXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null)
|
const dateMarker = doc.evaluate(dateXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null)
|
||||||
.singleNodeValue;
|
.singleNodeValue;
|
||||||
|
|
||||||
if (dateMarker) {
|
if (!dateMarker) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
// check to see if the parent container also contains the other two
|
// check to see if the parent container also contains the other two
|
||||||
const headerContainer = dateMarker.parentElement;
|
const headerContainer = dateMarker.parentElement;
|
||||||
let matches = 0;
|
let matches = 0;
|
||||||
|
@ -308,9 +394,12 @@ class QuotedHTMLTransformer {
|
||||||
matches++;
|
matches++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (matches === 2) {
|
if (matches !== 2) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
// got a hit! let's cut some text.
|
// got a hit! let's cut some text.
|
||||||
const quotedTextNodes = [];
|
const quotedTextNodes = this._collectAllNodesBelow(headerContainer);
|
||||||
|
|
||||||
// Special case to add "From:" because it's often detached from the rest of the
|
// Special case to add "From:" because it's often detached from the rest of the
|
||||||
// header fields. We just add it wherever it's located.
|
// header fields. We just add it wherever it's located.
|
||||||
|
@ -325,12 +414,17 @@ class QuotedHTMLTransformer {
|
||||||
quotedTextNodes.push(from);
|
quotedTextNodes.push(from);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return quotedTextNodes;
|
||||||
|
}
|
||||||
|
|
||||||
|
_collectAllNodesBelow = headerContainer => {
|
||||||
// The headers container and everything past it in the document is quoted text.
|
// The headers container and everything past it in the document is quoted text.
|
||||||
// This traverses the DOM, walking up the tree and adding all siblings below
|
// This traverses the DOM, walking up the tree and adding all siblings below
|
||||||
// our current path to the array.
|
// our current path to the array.
|
||||||
let head = headerContainer;
|
let head = headerContainer;
|
||||||
|
let results = [];
|
||||||
while (head) {
|
while (head) {
|
||||||
quotedTextNodes.push(head);
|
results.push(head);
|
||||||
while (head && !head.nextElementSibling) {
|
while (head && !head.nextElementSibling) {
|
||||||
head = head.parentElement;
|
head = head.parentElement;
|
||||||
}
|
}
|
||||||
|
@ -338,11 +432,8 @@ class QuotedHTMLTransformer {
|
||||||
head = head.nextElementSibling;
|
head = head.nextElementSibling;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return quotedTextNodes;
|
return results;
|
||||||
}
|
};
|
||||||
}
|
|
||||||
return [];
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export default new QuotedHTMLTransformer();
|
export default new QuotedHTMLTransformer();
|
||||||
|
|
Loading…
Add table
Reference in a new issue