From 4cae9a97a38cc912bd3907da7e4cac9217da2213 Mon Sep 17 00:00:00 2001
From: Evan Morikawa
""" + # Test 13: If there's an "On date…" string immediatley before a blockquote, + # then remove it. + tests.push + before: """ + Hey +
+ QUOTED TEXT ++
+
+ """
+ after: """
+ """ + + # Test 14: Don't pick up false positives on the string precursors to block + # quotes. + tests.push + before: """ + Hey +
+ QUOTED TEXT ++
+ """ + after: """
+ """ + it 'works with these manual test cases', -> for {before, after} in tests opts = keepIfWholeBodyIsQuote: true diff --git a/src/dom-utils.coffee b/src/dom-utils.coffee index 303662e49..c99aec573 100644 --- a/src/dom-utils.coffee +++ b/src/dom-utils.coffee @@ -312,6 +312,12 @@ DOMUtils = else continue return lastNode + lastDescendent: (node) -> + return null unless node + if node.childNodes.length > 0 + return DOMUtils.lastNode(node.childNodes[node.childNodes.length - 1]) + else return null + findLastTextNode: (node) -> return null unless node return node if node.nodeType is Node.TEXT_NODE diff --git a/src/dom-walkers.es6 b/src/dom-walkers.es6 new file mode 100644 index 000000000..13c9d3239 --- /dev/null +++ b/src/dom-walkers.es6 @@ -0,0 +1,23 @@ +const DOMWalkers = { + *walk(...treeWalkerArgs) { + const walker = document.createTreeWalker(...treeWalkerArgs); + let node = walker.nextNode(); + while (node) { + yield node; + node = walker.nextNode(); + } + return; + }, + + *walkBackwards(node) { + if (!node) { return; } + if (node.childNodes.length > 0) { + for (let i = node.childNodes.length - 1; i >= 0; i--) { + yield *this.walkBackwards(node.childNodes[i]); + } + } + yield node; + return; + }, +} +export default DOMWalkers diff --git a/src/services/quote-string-detector.es6 b/src/services/quote-string-detector.es6 new file mode 100644 index 000000000..720952f91 --- /dev/null +++ b/src/services/quote-string-detector.es6 @@ -0,0 +1,47 @@ +import DOMWalkers from '../dom-walkers' + +/* + * There are semi-common cases where immediately before a blockquote, we + * encounter a string like: "On Thu … so and so … wrote:". This should be part + * of the blockquote but was usually left as a collection of nodes. To help + * with false-positives, we only look for strings like that that immediately + * preceeded the blockquoted section. By the time the function gets here, the + * last blockquote has been removed and the text we want will be at the end of + * the document. + * + * This is in its own file to make use of ES6 generators + */ +export default function quoteStringDetector(doc) { + const quoteNodesToRemove = []; + let seenInitialQuoteEnd = false; + for (const node of DOMWalkers.walkBackwards(doc)) { + if (node.nodeType === Node.TEXT_NODE && node.nodeValue.trim().length > 0) { + if (!seenInitialQuoteEnd) { + if (/wrote:$/gim.test(node.nodeValue)) { + seenInitialQuoteEnd = true; + quoteNodesToRemove.push(node); + if (/On \S/gim.test(node.nodeValue)) { + // The beginning of the quoted string may be in the same node + return quoteNodesToRemove; + } + } else { + // This means there's some text in between the end of the content + // (adjacent to the blockquote) and the quote string. We shouldn't be + // killing any text in this case. + return quoteNodesToRemove; + } + } else { + quoteNodesToRemove.push(node) + if (/On \S/gim.test(node.nodeValue)) { + // This means we've reached the beginning of the quoted string. + return quoteNodesToRemove; + } + } + } else { + if (seenInitialQuoteEnd) { + quoteNodesToRemove.push(node) + } + } + } + return quoteNodesToRemove; +} diff --git a/src/services/quoted-html-transformer.coffee b/src/services/quoted-html-transformer.coffee index 5b166bc54..fff6b68c6 100644 --- a/src/services/quoted-html-transformer.coffee +++ b/src/services/quoted-html-transformer.coffee @@ -1,6 +1,7 @@ _ = require 'underscore' crypto = require 'crypto' DOMUtils = require '../dom-utils' +quoteStringDetector = require './quote-string-detector' class QuotedHTMLTransformer @@ -49,19 +50,23 @@ class QuotedHTMLTransformer # It's possible that the entire body was quoted text and we've removed everything. return "
" unless doc.body - childNodes = doc.body.childNodes - extraTailBrTags = [] - for i in [(childNodes.length - 1)..0] by -1 - curr = childNodes[i] - next = childNodes[i - 1] - if curr and curr.nodeName == 'BR' and next and next.nodeName == 'BR' - extraTailBrTags.push(curr) - else - break - - DOMUtils.Mutating.removeElements(extraTailBrTags) + @removeTrailingBr(doc) + DOMUtils.Mutating.removeElements(quoteStringDetector(doc)) return doc.children[0].innerHTML + # Finds any trailing BR tags and removes them in place + removeTrailingBr: (doc) -> + childNodes = doc.body.childNodes + extraTailBrTags = [] + for i in [(childNodes.length - 1)..0] by -1 + curr = childNodes[i] + next = childNodes[i - 1] + if curr and curr.nodeName == 'BR' and next and next.nodeName == 'BR' + extraTailBrTags.push(curr) + else + break + DOMUtils.Mutating.removeElements(extraTailBrTags) + appendQuotedHTML: (htmlWithoutQuotes, originalHTML) -> doc = @_parseHTML(originalHTML) quoteElements = @_findQuoteLikeElements(doc) @@ -195,5 +200,4 @@ class QuotedHTMLTransformer _findBlockquoteQuotes: (doc) -> return Array::slice.call(doc.querySelectorAll('blockquote')) - module.exports = new QuotedHTMLTransformer