Improve removal of quoted text added by terrible old mail clients

2025-10-03 01:44:42 +08:00 · 2019-07-31 03:50:56 -05:00 · 2019-07-31 03:50:56 -05:00 · 4f75f7435f
commit 4f75f7435f
parent 87a4853db9
1 changed files with 150 additions and 59 deletions
--- a/app/src/services/quoted-html-transformer.ts
+++ b/app/src/services/quoted-html-transformer.ts
@ -2,13 +2,30 @@ import quoteStringDetector from './quote-string-detector';
 import unwrappedSignatureDetector from './unwrapped-signature-detector';
 const { FIRST_ORDERED_NODE_TYPE } = XPathResult;
 const isEmptyishTextContent = el => {
  // A node counts as "empty-ish" when its trimmed text is blank, or when it is
  // nothing but dashes (e.g. the '---' that tends to be left behind once a
  // signature or confidentiality notice has been stripped out).
  const text = el.textContent.trim();
  if (text.length === 0) {
    return true;
  }
  return /^\-+$/.test(text);
 };
 const looksLikeTrackingPixel = img => {
  // We want to avoid hiding quoted text if the user has added an image beneath it, but only
  // if that image is more than 1px in size. Returns true when BOTH dimensions are known
  // and are at most 1 pixel.
  //
  // Each dimension is read from the HTML attribute first and the inline style second.
  // Values may be a bare number ("1") or carry a px suffix ("1px"); mail generators
  // frequently emit width="1px" and browsers render that as one pixel, so it has to be
  // recognised here too. Anything else (missing, "100%", "auto", ...) is treated as
  // "large" so the image is never mistaken for a tracking pixel.
  const UNKNOWN_SIZE = 10000;
  const pixelValue = /^\s*(\d+\.?\d*|\.\d+)\s*(?:px)?\s*$/i;
  const readDimension = name => {
    const raw = img.getAttribute(name) || (img.style && img.style[name]) || '';
    const match = pixelValue.exec(String(raw));
    return match ? Number(match[1]) : UNKNOWN_SIZE;
  };
  return readDimension('width') <= 1 && readDimension('height') <= 1;
 };
 class QuotedHTMLTransformer {
  annotationClass = 'mailspring-quoted-text-segment';
  // Public: Returns true if the given HTML string contains quoted text.
  //
  // The HTML is parsed into a document, and the result is whether
  // _findQuoteElements returns at least one element for that document.
  // The added lines below first run _removeImagesStrippedByAnotherClient and
  // _removeTrailingFootersAndWhitespace on the parsed document.
  hasQuotedHTML(html) {
    const doc = this._parseHTML(html);
-    const quoteElements = this._findQuoteElements(doc);
+    this._removeImagesStrippedByAnotherClient(doc);
-    return quoteElements.length > 0;
+    this._removeTrailingFootersAndWhitespace(doc);
    return this._findQuoteElements(doc).length > 0;
  }
  // Public: Removes quoted text from an HTML string
@ -27,15 +44,15 @@ class QuotedHTMLTransformer {
  //
  removeQuotedHTML(html, options = { keepIfWholeBodyIsQuote: true }) {
    const doc = this._parseHTML(html);
-
+    this._removeImagesStrippedByAnotherClient(doc);
    this._removeTrailingFootersAndWhitespace(doc);
    for (const el of this._findQuoteElements(doc)) {
      if (el) {
        el.remove();
      }
    }
-    // It's possible that the entire body was quoted text anyway and we've
+    // It's possible that the entire body was quoted text anyway and we've removed everything.
    // removed everything.
    if (options.keepIfWholeBodyIsQuote) {
      if (!doc.body || !doc.children[0] || doc.body.textContent.trim().length === 0) {
        return this._outputHTMLFor(this._parseHTML(html), { initialHTML: html });
@ -52,7 +69,8 @@ class QuotedHTMLTransformer {
      }
    }
-    this._removeImagesStrippedByAnotherClient(doc);
+    // after removing all the quoted text, delete any whitespace that appeared between blocks
    // so the email doesn't end with <br> x 50
    this._removeUnnecessaryWhitespace(doc);
    return this._outputHTMLFor(doc, { initialHTML: html });
@ -121,14 +139,14 @@ class QuotedHTMLTransformer {
      while (el.lastChild) {
        const child = el.lastChild;
        if (child.nodeType === Node.TEXT_NODE) {
-          if (child.textContent.trim() === '') {
+          if (isEmptyishTextContent(child)) {
            child.remove();
            continue;
          }
        }
        if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) {
          removeTrailingWhitespaceChildren(child);
-          if (child.childElementCount === 0 && child.textContent.trim() === '') {
+          if (child.childElementCount === 0 && isEmptyishTextContent(child)) {
            child.remove();
            continue;
          }
@ -185,12 +203,12 @@ class QuotedHTMLTransformer {
      this._findGmailQuotes,
      this._findBlockquoteQuotes,
      this._findQuotesAfterMessageHeaderBlock,
-      this._findConfidentialityNotice,
+      this._findQuotesAfter__OriginalMessage__,
    ];
    let quoteElements = [];
    for (const parser of parsers) {
-      quoteElements = quoteElements.concat(parser(doc) || []);
+      quoteElements = quoteElements.concat(parser.call(this, doc) || []);
    }
    // Find top-level nodes that look like a signature - some clients append
@ -238,10 +256,10 @@ class QuotedHTMLTransformer {
        if (node.childNodes) {
          pile.push(...node.childNodes);
        }
-        if (node.nodeName === 'IMG') {
+        if (node.nodeName === 'IMG' && !looksLikeTrackingPixel(node)) {
          return true;
        }
-        if (node.nodeType === Node.TEXT_NODE && node.textContent.trim().length > 0) {
+        if (node.nodeType === Node.TEXT_NODE && !isEmptyishTextContent(node)) {
          return true;
        }
      }
@ -262,26 +280,89 @@ class QuotedHTMLTransformer {
    return Array.from(doc.querySelectorAll('blockquote'));
  }
-  _findConfidentialityNotice(doc) {
+  _removeTrailingFootersAndWhitespace(doc) {
    // Repeatedly asks _findTrailingFooter for footer nodes, removes them from the
    // document, and then runs _removeUnnecessaryWhitespace, until no footer is found.
    // Mutates `doc` in place and returns nothing.
    let els = [];
    let iters = 0;
    // _findTrailingFooter returns null when nothing matches, which ends the loop.
    while ((els = this._findTrailingFooter(doc))) {
      iters++;
      els.forEach(el => el.remove());
      this._removeUnnecessaryWhitespace(doc);
      // Safety valve: stop once more than 20 passes have run.
      if (iters > 20) {
        return;
      }
    }
  }
  _findTrailingFooter(doc) {
    // Returns an array of nodes that make up a trailing footer, or null if none is found.
    //
    // Traverse from the body down the tree of "last" nodes looking for a
-    // Confidentiality Notice TEXT_NODE. We need to count this node as quoted
+    // Confidentiality Notice, "To unsubscribe from this group", etc.
-    // text or it'll be handled as an inline reply and totally disable quoted
+    // We strip these nodes because otherwise the quoted text logic
-    // text removal.
+    // thinks that they are inline replies to quoted text.
    const footerRegexps = [
      /^Confidentiality Notice/i,
      /strictly confidential/i,
      /This email message is/i,
      /You received this message because/i,
    ];
    // Start at <body>; each iteration either returns a match or moves `head`
    // to its last child / previous sibling (see the bottom of the loop).
    let head = doc.body;
    while (head) {
      const tc = head.textContent.trim();
      // Only text nodes are tested against the footer patterns.
-      if (head.nodeType === Node.TEXT_NODE && tc.startsWith('Confidentiality Notice')) {
+      if (head.nodeType === Node.TEXT_NODE) {
        if (footerRegexps.find(r => r.test(tc))) {
          return [head];
        }
      }
      // chop off Google groups unsubscribe instructions which are appended
      // to the end but annoyingly not in a container.
      // Matches a node whose text is "." when the node two siblings before it
      // starts with "To unsubscribe"; all three nodes are returned.
      if (
        tc === '.' &&
        head.previousSibling &&
        head.previousSibling.previousSibling &&
        head.previousSibling.previousSibling.textContent.trim().startsWith('To unsubscribe')
      ) {
        return [head, head.previousSibling, head.previousSibling.previousSibling];
      }
      // chop off gmail_signature if the user has it configured to go at the absolute
      // bottom of the email
      if (head.nodeName === 'DIV' && head.classList.contains('gmail_signature')) {
        return [head];
      }
      // A childless node with no text cannot hold a footer, so step back to the
      // sibling before it; otherwise descend into the node's last child.
      if (head.childNodes.length === 0 && tc === '') {
        head = head.previousSibling;
      } else {
        head = head.lastChild;
      }
    }
    // Ran out of nodes without finding a footer.
    return null;
  }
  _findQuotesAfter__OriginalMessage__(doc) {
    // these are pulled from specific messages seen in the wild. I think that doing this
    // via Xpath is still more performant than writing code to traverse + examine?
    const originalMessageMarker = doc.evaluate(
      `//div[. = '-------- Original message --------'] |
       //div[. = '------ Original Message ------'] |
       //div[starts-with(., '-----Original Message-----')] |
       //i[. = '-------Original Message-------'] |
       //div[. = '---Original---']`,
      doc.body,
      null,
      FIRST_ORDERED_NODE_TYPE,
      null
    ).singleNodeValue;
    if (!originalMessageMarker) {
      return [];
    }
    return this._collectAllNodesBelow(originalMessageMarker);
  }
  _findQuotesAfterMessageHeaderBlock(doc) {
    // This detector looks for an element in the DOM tree containing
    // three children: <b>Sent:</b> or <b>Date:</b> and <b>To:</b> and
@ -289,6 +370,8 @@ class QuotedHTMLTransformer {
    // Find a DOM node exactly matching <b>Sent:</b>
    const dateXPath = `
      //b[. = 'Sent:'] |
      //b[. = 'Date:'] |
      //b[. = 'Sent: '] |
      //b[. = 'Date: '] |
      //span[. = 'Sent: '] |
@ -298,7 +381,10 @@ class QuotedHTMLTransformer {
    const dateMarker = doc.evaluate(dateXPath, doc.body, null, FIRST_ORDERED_NODE_TYPE, null)
      .singleNodeValue;
-    if (dateMarker) {
+    if (!dateMarker) {
      return [];
    }
    // check to see if the parent container also contains the other two
    const headerContainer = dateMarker.parentElement;
    let matches = 0;
@ -308,9 +394,12 @@ class QuotedHTMLTransformer {
        matches++;
      }
    }
-      if (matches === 2) {
+    if (matches !== 2) {
      return [];
    }
    // got a hit! let's cut some text.
-        const quotedTextNodes = [];
+    const quotedTextNodes = this._collectAllNodesBelow(headerContainer);
    // Special case to add "From:" because it's often detached from the rest of the
    // header fields. We just add it wherever it's located.
@ -325,12 +414,17 @@ class QuotedHTMLTransformer {
      quotedTextNodes.push(from);
    }
    return quotedTextNodes;
  }
  _collectAllNodesBelow = headerContainer => {
    // The headers container and everything past it in the document is quoted text.
    // This traverses the DOM, walking up the tree and adding all siblings below
    // our current path to the array.
    let head = headerContainer;
    let results = [];
    while (head) {
-          quotedTextNodes.push(head);
+      results.push(head);
      while (head && !head.nextElementSibling) {
        head = head.parentElement;
      }
@ -338,11 +432,8 @@ class QuotedHTMLTransformer {
        head = head.nextElementSibling;
      }
    }
-        return quotedTextNodes;
+    return results;
-      }
+  };
    }
    return [];
  }
 }
 export default new QuotedHTMLTransformer();