feat(quote): add new quoted text detector for quote strings

This commit is contained in:
Evan Morikawa 2016-03-03 17:30:17 -08:00
parent 4e05fc45c8
commit e7ebf0ba2c
5 changed files with 151 additions and 12 deletions

View file

@ -297,6 +297,65 @@ describe "QuotedHTMLTransformer", ->
<br></body>
"""
# Test 13: If there's an "On date… wrote:" string immediately before a
# blockquote, then remove it together with the blockquote. In this fixture
# the string is spread over several text nodes and a <span> holding the
# sender's address; the unrelated "On FOOBAR" text earlier in the same <div>
# must be left in place.
tests.push
before: """
Hey
<div>
On FOOBAR
<br>
On Thu, Mar 3, 2016
at 3:19 AM,
First Middle Last-Last
<span dir="ltr">
&lt;
<a href="mailto:test@nylas.com" target="_blank">
test@nylas.com
</a>
&gt;
</span>
wrote:
<br>
<blockquote>
QUOTED TEXT
</blockquote>
</div>
<br>
"""
after: """<head></head><body>
Hey
<div>
On FOOBAR
<br><br>
</div><br></body>
"""
# Test 14: Don't pick up false positives on the string precursors to block
# quotes. Here a sentence that happens to look like "On … wrote:" is followed
# by a <strong> element before the blockquote, so it is not immediately
# adjacent to the quote: only the blockquote itself is expected to go.
tests.push
before: """
Hey
<div>
On FOOBAR
<br>
On Thu, Mar 3, 2016 I went to my writing club and wrote:
<strong>A little song</strong>
<blockquote>
QUOTED TEXT
</blockquote>
</div>
"""
after: """<head></head><body>
Hey
<div>
On FOOBAR
<br>
On Thu, Mar 3, 2016 I went to my writing club and wrote:
<strong>A little song</strong>
</div></body>
"""
it 'works with these manual test cases', ->
for {before, after} in tests
opts = keepIfWholeBodyIsQuote: true

View file

@ -312,6 +312,12 @@ DOMUtils =
else continue
return lastNode
# Resolves the "last" node underneath `node`.
#
# Returns null when `node` is falsy or has no child nodes. Otherwise the
# result is whatever `DOMUtils.lastNode` returns for the final child.
# NOTE(review): `lastNode` is defined outside this view; presumably it
# descends to the deepest last node — confirm.
lastDescendent: (node) ->
  return null unless node
  children = node.childNodes
  return null unless children.length > 0
  finalChild = children[children.length - 1]
  return DOMUtils.lastNode(finalChild)
findLastTextNode: (node) ->
return null unless node
return node if node.nodeType is Node.TEXT_NODE

23
src/dom-walkers.es6 Normal file
View file

@ -0,0 +1,23 @@
/**
 * Generator helpers for iterating over DOM nodes lazily.
 */
const DOMWalkers = {
  /**
   * Yields, one at a time, every node produced by a TreeWalker.
   *
   * @param {...*} treeWalkerArgs - Forwarded unchanged to
   *   `document.createTreeWalker`.
   * @yields {Node} Each node returned by `walker.nextNode()` until it
   *   returns a falsy value.
   */
  *walk(...treeWalkerArgs) {
    const walker = document.createTreeWalker(...treeWalkerArgs);
    for (let current = walker.nextNode(); current; current = walker.nextNode()) {
      yield current;
    }
  },

  /**
   * Yields the subtree rooted at `node` from the end towards the start:
   * children are visited last-to-first (recursively), and each parent is
   * yielded only after all of its children.
   *
   * @param {Node} node - Root of the subtree; a falsy value yields nothing.
   * @yields {Node}
   */
  *walkBackwards(node) {
    if (!node) {
      return;
    }
    for (let remaining = node.childNodes.length; remaining > 0; remaining--) {
      yield* this.walkBackwards(node.childNodes[remaining - 1]);
    }
    yield node;
  },
};

export default DOMWalkers;

View file

@ -0,0 +1,47 @@
import DOMWalkers from '../dom-walkers'
/**
 * Finds the nodes that make up a trailing "On <date> … <sender> … wrote:"
 * attribution string.
 *
 * There are semi-common cases where, immediately before a blockquote, we
 * encounter a string like "On Thu … so and so … wrote:". It should be part
 * of the blockquote but is usually left behind as a loose collection of
 * nodes. To limit false positives we only accept such a string when it
 * immediately precedes the blockquoted section. By the time this function
 * runs the last blockquote has already been removed, so the text we want
 * sits at the very end of the document.
 *
 * The document is walked backwards. The first non-blank text node must end
 * (on one of its lines) with "wrote:"; from there every node is collected
 * until a text node containing "On <something>" is reached.
 *
 * This is in its own file to make use of ES6 generators.
 *
 * @param {Node} doc - Root node to scan (walked with
 *   `DOMWalkers.walkBackwards`).
 * @returns {Node[]} Nodes to remove, in backwards-walk order. Empty when no
 *   complete "On … wrote:" string is found directly at the end of the
 *   content.
 */
export default function quoteStringDetector(doc) {
  // No `g` flag: these patterns are only used with `.test()`, where a global
  // regex would carry `lastIndex` state between calls.
  // `m` lets `$` match before a trailing newline inside the text node.
  const quoteEndPattern = /wrote:$/im;
  // `\b` keeps "on " inside a word (e.g. "Jason Smith") from being mistaken
  // for the start of the attribution.
  const quoteStartPattern = /\bOn \S/i;

  const quoteNodesToRemove = [];
  let seenInitialQuoteEnd = false;

  for (const node of DOMWalkers.walkBackwards(doc)) {
    const isVisibleText =
      node.nodeType === Node.TEXT_NODE && node.nodeValue.trim().length > 0;

    if (!isVisibleText) {
      // Elements and whitespace only count once we are inside the string.
      if (seenInitialQuoteEnd) {
        quoteNodesToRemove.push(node);
      }
      continue;
    }

    if (!seenInitialQuoteEnd) {
      if (!quoteEndPattern.test(node.nodeValue)) {
        // There is other text between the end of the content (adjacent to
        // the blockquote) and any quote string. Don't remove anything.
        return [];
      }
      seenInitialQuoteEnd = true;
    }

    quoteNodesToRemove.push(node);
    if (quoteStartPattern.test(node.nodeValue)) {
      // Reached the beginning of the quote string (possibly in the same
      // node that ended it).
      return quoteNodesToRemove;
    }
  }

  // Either nothing ended with "wrote:", or it did but no "On …" start was
  // ever found. In the latter case the collected list holds every preceding
  // node and all ancestors up to the root, so it must not be returned.
  return [];
}

View file

@ -1,6 +1,7 @@
_ = require 'underscore'
crypto = require 'crypto'
DOMUtils = require '../dom-utils'
quoteStringDetector = require './quote-string-detector'
class QuotedHTMLTransformer
@ -49,19 +50,23 @@ class QuotedHTMLTransformer
# It's possible that the entire body was quoted text and we've removed everything.
return "<head></head><body></body>" unless doc.body
childNodes = doc.body.childNodes
extraTailBrTags = []
for i in [(childNodes.length - 1)..0] by -1
curr = childNodes[i]
next = childNodes[i - 1]
if curr and curr.nodeName == 'BR' and next and next.nodeName == 'BR'
extraTailBrTags.push(curr)
else
break
DOMUtils.Mutating.removeElements(extraTailBrTags)
@removeTrailingBr(doc)
DOMUtils.Mutating.removeElements(quoteStringDetector(doc))
return doc.children[0].innerHTML
# Collapses a run of trailing <br> tags at the end of `doc.body`, in place.
#
# Walks the body's direct children from the last one backwards and collects
# every BR whose preceding sibling is also a BR, stopping at the first child
# that does not fit. The earliest BR of the trailing run is therefore kept.
# The collected nodes are passed to `DOMUtils.Mutating.removeElements`.
removeTrailingBr: (doc) ->
  bodyChildren = doc.body.childNodes
  redundantBrs = []
  index = bodyChildren.length - 1
  while index >= 0
    node = bodyChildren[index]
    preceding = bodyChildren[index - 1]
    nodeIsBr = node and node.nodeName == 'BR'
    precedingIsBr = preceding and preceding.nodeName == 'BR'
    break unless nodeIsBr and precedingIsBr
    redundantBrs.push(node)
    index -= 1
  DOMUtils.Mutating.removeElements(redundantBrs)
appendQuotedHTML: (htmlWithoutQuotes, originalHTML) ->
doc = @_parseHTML(originalHTML)
quoteElements = @_findQuoteLikeElements(doc)
@ -195,5 +200,4 @@ class QuotedHTMLTransformer
# Returns every <blockquote> element found in `doc` as a plain Array
# (copied from the result of `querySelectorAll`).
_findBlockquoteQuotes: (doc) ->
  blockquotes = doc.querySelectorAll('blockquote')
  return Array::slice.call(blockquotes)
# Export one QuotedHTMLTransformer instance rather than the class itself.
module.exports = new QuotedHTMLTransformer