Mailspring/spec-nylas/quoted-html-parser-spec.coffee
Evan Morikawa 4cad525cfd fix(quoted-text): new system to remove quoted text
Summary:
We need to remove quoted text completely from bodies in a composer.
Unfortunately, that makes it very difficult to determine how to put it
back in.

For now the scheme is to append the quoted text at the end. However, that
means that we need to only pull out quoted text at the end of a message.

Unfortunately there are lots of places that quoted text appears inline
with regular text.

Determining whether or not some content is at the "end" of a message
turned out to be non-trivial.

We now have a new `DOMUtils` that looks for empty areas at the end.

We also have a new quoted HTML parser that finds trailing quotes.

Fixes T2335

Test Plan: lots of new quoted text tests

Reviewers: bengotow

Reviewed By: bengotow

Maniphest Tasks: T2335

Differential Revision: https://phab.nylas.com/D1773
2015-07-21 11:34:47 -07:00

264 lines
6.6 KiB
CoffeeScript

_ = require('underscore')
fs = require('fs')
path = require 'path'
QuotedHTMLParser = require('../src/services/quoted-html-parser')
describe "QuotedHTMLParser", ->
readFile = (fname) ->
emailPath = path.resolve(__dirname, 'fixtures', 'emails', fname)
return fs.readFileSync(emailPath, 'utf8')
hideQuotedHTML = (fname) ->
return QuotedHTMLParser.hideQuotedHTML(readFile(fname))
removeQuotedHTML = (fname) ->
return QuotedHTMLParser.removeQuotedHTML(readFile(fname))
numQuotes = (html) ->
re = new RegExp(QuotedHTMLParser.annotationClass, 'g')
html.match(re)?.length ? 0
[1..15].forEach (n) ->
it "properly parses email_#{n}", ->
expect(removeQuotedHTML("email_#{n}.html")).toEqual readFile("email_#{n}_stripped.html")
describe 'manual quote detection tests', ->
clean = (str) ->
str.replace(/[\n\r]/g, "").replace(/\s{2,}/g, " ")
# The key is the inHTML. The value is the outHTML
tests = []
# Test 1
tests.push
before: """
<div>
Some text
<p>More text</p>
<blockquote id="inline-parent-quote">
Parent
<blockquote id="inline-sub-quote">
Sub
<blockquote id="inline-sub-sub-quote">Sub Sub</blockquote>
Sub
</blockquote>
</blockquote>
<div>Text at end</div>
<blockquote id="last-quote">
<blockquote>
The last quote!
</blockquote>
</blockquote>
</div>
"""
after: """<head></head><body>
<div>
Some text
<p>More text</p>
<blockquote id="inline-parent-quote">
Parent
<blockquote id="inline-sub-quote">
Sub
<blockquote id="inline-sub-sub-quote">Sub Sub</blockquote>
Sub
</blockquote>
</blockquote>
<div>Text at end</div>
</div></body>
"""
# Test 2
tests.push
before: """
<br>
<blockquote>Nothing but quotes</blockquote>
<br>
<br>
"""
after: """<head></head><body>
<br>
<br>
<br></body>
"""
# Test 3: It found the blockquote in another div
tests.push
before: """
<div>Hello World</div>
<br>
<div>
<blockquote>Nothing but quotes</blockquote>
</div>
<br>
<br>
"""
after: """<head></head><body>
<div>Hello World</div>
<br>
<div>
</div>
<br>
<br></body>
"""
# Test 4: It works inside of a wrapped div
tests.push
before: """
<div>
<br>
<blockquote>Nothing but quotes</blockquote>
<br>
<br>
</div>
"""
after: """<head></head><body>
<div>
<br>
<br>
<br>
</div></body>
"""
# Test 5: Inline quotes and text
tests.push
before: """
Hello
<blockquote>Inline quote</blockquote>
World
"""
after: """<head></head><body>
Hello
<blockquote>Inline quote</blockquote>
World</body>
"""
# Test 6: No quoted elements at all
tests.push
before: """
Hello World
"""
after: """<head></head><body>
Hello World</body>
"""
# Test 7: Common ancestor is a quoted node
tests.push
before: """
<div>Content</div>
<blockquote>
Some content
<blockquote>More content</blockquote>
Other content
</blockquote>
"""
after: """<head></head><body>
<div>Content</div></body>
"""
# Test 8: All of our quote blocks we want to remove are at the end…
# sortof… but nested in a bunch of stuff
#
# Note that "content" is burried deep in the middle of a div
tests.push
before: """
<div>Content</div>
<blockquote>
Some content
<blockquote>More content</blockquote>
Other content
</blockquote>
<div>
<blockquote>Some text quote</blockquote>
Some text
<div>
More text
<blockquote>A quote</blockquote>
<br>
</div>
<br>
<blockquote>Another quote</blockquote>
<br>
</div>
<br>
<blockquote>More quotes!</blockquote>
"""
after: """<head></head><body>
<div>Content</div>
<blockquote>
Some content
<blockquote>More content</blockquote>
Other content
</blockquote>
<div>
<blockquote>Some text quote</blockquote>
Some text
<div>
More text
<br>
</div>
<br>
<br>
</div>
<br>
</body>
"""
# Test 9: Last several tags are blockquotes. Note the 3 blockquote
# at the end, the interstital div, and the blockquote inside of the
# first div
tests.push
before: """
<div>
<blockquote>I'm inline</blockquote>
Content
<blockquote>Remove me</blockquote>
</div>
<blockquote>Foo</blockquote>
<div></div>
<blockquote>Bar</blockquote>
<blockquote>Baz</blockquote>
"""
after: """<head></head><body>
<div>
<blockquote>I'm inline</blockquote>
Content
</div>
<div></div></body>
"""
it 'works with these manual test cases', ->
for {before, after} in tests
test = clean(QuotedHTMLParser.removeQuotedHTML(before))
expect(test).toEqual clean(after)
# We have a little utility method that you can manually uncomment to
# generate what the current iteration of the QuotedHTMLParser things the
# `removeQuotedHTML` should look like. These can be manually inspected in
# a browser before getting their filename changed to
# `email_#{n}_stripped.html". The actually tests will run the current
# iteration of the `removeQuotedHTML` against these files to catch if
# anything has changed in the parser.
#
# It's inside of the specs here instaed of its own script because the
# `QuotedHTMLParser` needs Electron booted up in order to work because
# of the DOMParser.
xit "Run this simple funciton to generate output files", ->
[1..15].forEach (n) ->
newHTML = QuotedHTMLParser.removeQuotedHTML(readFile("email_#{n}.html"))
outPath = path.resolve(__dirname, 'fixtures', 'emails', "email_#{n}_raw_stripped.html")
fs.writeFileSync(outPath, newHTML)