feat(quote): add new quoted text detector for quote strings

2025-02-23 23:54:13 +08:00 · 2016-03-03 17:30:17 -08:00 · 2016-03-03 17:30:17 -08:00 · e7ebf0ba2c
commit e7ebf0ba2c
parent 4e05fc45c8
5 changed files with 151 additions and 12 deletions
--- a/spec/quoted-html-transformer-spec.coffee
+++ b/spec/quoted-html-transformer-spec.coffee
@ -297,6 +297,65 @@ describe "QuotedHTMLTransformer", ->
        <br></body>
        """

+    # Test 13: If there's an "On date…" string immediately before a blockquote,
+    # then remove it.
+    tests.push
+      before: """
+        Hey
+        <div>
+          On FOOBAR
+          <br>
+          On Thu, Mar 3, 2016
+          at 3:19 AM,
+          First Middle Last-Last
+          <span dir="ltr">
+            &lt;
+            <a href="mailto:test@nylas.com" target="_blank">
+              test@nylas.com
+            </a>
+            &gt;
+          </span>
+          wrote:
+          <br>
+          <blockquote>
+            QUOTED TEXT
+          </blockquote>
+        </div>
+        <br>
+      """
+      after: """<head></head><body>
+        Hey
+        <div>
+          On FOOBAR
+          <br><br>
+          </div><br></body>
+      """
+
+    # Test 14: Don't pick up false positives on the string precursors to block
+    # quotes.
+    tests.push
+      before: """
+        Hey
+        <div>
+        On FOOBAR
+        <br>
+        On Thu, Mar 3, 2016 I went to my writing club and wrote:
+        <strong>A little song</strong>
+        <blockquote>
+          QUOTED TEXT
+        </blockquote>
+        </div>
+      """
+      after: """<head></head><body>
+        Hey
+        <div>
+        On FOOBAR
+        <br>
+        On Thu, Mar 3, 2016 I went to my writing club and wrote:
+        <strong>A little song</strong>
+        </div></body>
+      """
+
    it 'works with these manual test cases', ->
      for {before, after} in tests
        opts = keepIfWholeBodyIsQuote: true
--- a/src/dom-utils.coffee
+++ b/src/dom-utils.coffee
@ -312,6 +312,12 @@ DOMUtils =
      else continue
    return lastNode

+  # Returns the result of DOMUtils.lastNode applied to the final child of
+  # `node`, or null when `node` is falsy or has no child nodes.
+  # NOTE(review): DOMUtils.lastNode is not visible in this hunk — confirm it
+  # exists and is the intended helper.
+  lastDescendent: (node) ->
+    return null unless node
+    children = node.childNodes
+    return null unless children.length > 0
+    return DOMUtils.lastNode(children[children.length - 1])
+
  findLastTextNode: (node) ->
    return null unless node
    return node if node.nodeType is Node.TEXT_NODE
--- a/src/dom-walkers.es6
+++ b/src/dom-walkers.es6
@ -0,0 +1,23 @@
+const DOMWalkers = {
+  *walk(...treeWalkerArgs) {
+    const walker = document.createTreeWalker(...treeWalkerArgs);
+    let node = walker.nextNode();
+    while (node) {
+      yield node;
+      node = walker.nextNode();
+    }
+    return;
+  },
+
+  *walkBackwards(node) {
+    if (!node) { return; }
+    if (node.childNodes.length > 0) {
+      for (let i = node.childNodes.length - 1; i >= 0; i--) {
+        yield *this.walkBackwards(node.childNodes[i]);
+      }
+    }
+    yield node;
+    return;
+  },
+}
+export default DOMWalkers
--- a/src/services/quote-string-detector.es6
+++ b/src/services/quote-string-detector.es6
@ -0,0 +1,47 @@
+import DOMWalkers from '../dom-walkers'
+
+/*
+ * There are semi-common cases where immediately before a blockquote, we
+ * encounter a string like: "On Thu … so and so … wrote:". This should be part
+ * of the blockquote but was usually left as a collection of nodes. To help
+ * with false-positives, we only look for strings like that that immediately
+ * preceeded the blockquoted section. By the time the function gets here, the
+ * last blockquote has been removed and the text we want will be at the end of
+ * the document.
+ *
+ * This is in its own file to make use of ES6 generators
+ */
+export default function quoteStringDetector(doc) {
+  const quoteNodesToRemove = [];
+  let seenInitialQuoteEnd = false;
+  for (const node of DOMWalkers.walkBackwards(doc)) {
+    if (node.nodeType === Node.TEXT_NODE && node.nodeValue.trim().length > 0) {
+      if (!seenInitialQuoteEnd) {
+        if (/wrote:$/gim.test(node.nodeValue)) {
+          seenInitialQuoteEnd = true;
+          quoteNodesToRemove.push(node);
+          if (/On \S/gim.test(node.nodeValue)) {
+            // The beginning of the quoted string may be in the same node
+            return quoteNodesToRemove;
+          }
+        } else {
+          // This means there's some text in between the end of the content
+          // (adjacent to the blockquote) and the quote string. We shouldn't be
+          // killing any text in this case.
+          return quoteNodesToRemove;
+        }
+      } else {
+        quoteNodesToRemove.push(node)
+        if (/On \S/gim.test(node.nodeValue)) {
+          // This means we've reached the beginning of the quoted string.
+          return quoteNodesToRemove;
+        }
+      }
+    } else {
+      if (seenInitialQuoteEnd) {
+        quoteNodesToRemove.push(node)
+      }
+    }
+  }
+  return quoteNodesToRemove;
+}
--- a/src/services/quoted-html-transformer.coffee
+++ b/src/services/quoted-html-transformer.coffee
@ -1,6 +1,7 @@
 _ = require 'underscore'
 crypto = require 'crypto'
 DOMUtils = require '../dom-utils'
+quoteStringDetector = require './quote-string-detector'

 class QuotedHTMLTransformer

@ -49,19 +50,23 @@ class QuotedHTMLTransformer
      # It's possible that the entire body was quoted text and we've removed everything.
      return "<head></head><body></body>" unless doc.body

-      childNodes = doc.body.childNodes
-      extraTailBrTags = []
-      for i in [(childNodes.length - 1)..0] by -1
-        curr = childNodes[i]
-        next = childNodes[i - 1]
-        if curr and curr.nodeName == 'BR' and next and next.nodeName == 'BR'
-          extraTailBrTags.push(curr)
-        else
-          break
-
-      DOMUtils.Mutating.removeElements(extraTailBrTags)
+      @removeTrailingBr(doc)
+      DOMUtils.Mutating.removeElements(quoteStringDetector(doc))
      return doc.children[0].innerHTML

+  # Removes redundant trailing <br> tags from the end of doc.body, in place.
+  # Scanning from the last child towards the first, a <br> is collected for
+  # removal while the sibling before it is also a <br>, so one <br> of the
+  # trailing run is kept. The scan stops at the first node that breaks the run.
+  removeTrailingBr: (doc) ->
+    bodyNodes = doc.body.childNodes
+    redundantBrs = []
+    index = bodyNodes.length - 1
+    while index >= 0
+      node = bodyNodes[index]
+      previous = bodyNodes[index - 1]
+      isBrPair = node and node.nodeName is 'BR' and previous and previous.nodeName is 'BR'
+      break unless isBrPair
+      redundantBrs.push(node)
+      index -= 1
+    DOMUtils.Mutating.removeElements(redundantBrs)
+
  appendQuotedHTML: (htmlWithoutQuotes, originalHTML) ->
    doc = @_parseHTML(originalHTML)
    quoteElements = @_findQuoteLikeElements(doc)
@ -195,5 +200,4 @@ class QuotedHTMLTransformer
  # Returns every <blockquote> element found in `doc` as a plain Array.
  _findBlockquoteQuotes: (doc) ->
    blockquotes = doc.querySelectorAll('blockquote')
    return Array::slice.call(blockquotes)

-
 module.exports = new QuotedHTMLTransformer