feat(quote): improved quoted text detection for trailing signatures

This commit is contained in:
Evan Morikawa 2016-11-04 20:45:25 -07:00
parent c0b28456a9
commit dbc81a87a4
8 changed files with 97 additions and 2 deletions

14
spec/fixtures/emails/email_20.html vendored Normal file
View file

@ -0,0 +1,14 @@
<div dir="ltr">Yaaay!&nbsp; So excited :) &nbsp;And no worries, see you in PR, if not before</div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Nov 4, 2016 at 2:07 PM, Evan Morikawa <span dir="ltr">&lt;<a href="mailto:evan@evanmorikawa.com" target="_blank">evan@evanmorikawa.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Y,<div><div><br></div><div>YES! We&#39;d love to go to Puerto Rico. I just signed up on the site for Nora and I. I&#39;m so sorry for the LONG delay on getting back to you. We had a lot of other commitments up in the air around then. So excited to see you guys in Puerto Rico!</div></div><div><br></div><div>Also, I&#39;m unfortunately in NYC the week of Nov 7th and back in SF week of the 14th, otherwise I&#39;d love to see you here too.</div><div><br></div><div>Evan</div><img class="m_2465269450974321714n1-open" width="0" height="0" style="border:0;width:0;height:0" src="https://link.nylas.com/open/1ocrhlu1fap8935xrnic0cmnm/local-139b0028-d812?r=eWlmYW56aGFuZzJAZ21haWwuY29t"><div class="HOEnZb"><div class="h5">
<div class="gmail_quote m_2465269450974321714nylas-quote m_2465269450974321714nylas-quote-id-92my6rmekrk94aws2clzwhwgy">
<br>
On Nov 3 2016, at 6:56 pm, Y J &lt;<a href="mailto:YJ2@gmail.com" target="_blank">YJ2@gmail.com</a>&gt; wrote:
<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div dir="ltr">Hi Evan &amp; Nora,<div>We&#39;re getting down to the wire and need to send final counts to our vendors <b>tomorrow</b>.&nbsp; If you could please let us know your RSVP for the Dec 10 engagement party and/or the Jan 15 wedding via our website, that would be amazing!&nbsp; Hope to see you soon :)</div><div><br></div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Ftheknot.com%2Fus%2FY-and-geoff&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">theknot.com/us/Y-and-geoff</a><br></div><div><br></div><div>Y &amp; Geoff</div>
<div><div><br></div>-- <br><div><div dir="ltr"><div>Ms. Y J<br><br><div> College 2010<br>AB in Economics</div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Fwww.twitter.com%2FYz&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">@Yz</a></div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Fwww.app.com&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">www.app.com</a></div><div><br></div></div></div></div>
</div>
</div>
<img src="https://YJ2-dot-yamm-track.appspot.com/FireBase?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177" width="1" height="1" alt="beacon" style="display:none;display:none!important">
</blockquote>
</div></div></div></blockquote></div><br><br clear="all"><div><br></div>-- <br><div class="gmail_signature" data-smartmail="gmail_signature"><div dir="ltr"><div>Ms. Y J<br><br><div> College 2010<br>AB in Economics</div><div><a href="http://www.twitter.com/Yz" target="_blank">@Yz</a></div><div><a href="http://www.app.com" target="_blank">www.app.com</a></div><div><br></div></div></div></div>
</div>

View file

@ -0,0 +1,2 @@
<div dir="ltr">Yaaay!&nbsp; So excited :) &nbsp;And no worries, see you in PR, if not before</div><div class="gmail_extra"><br><br><br clear="all"><div><br></div><br>
</div>

16
spec/fixtures/emails/email_21.html vendored Normal file
View file

@ -0,0 +1,16 @@
<div dir="ltr">Yaaay!&nbsp; So excited :) &nbsp;And no worries, see you in PR, if not before</div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Nov 4, 2016 at 2:07 PM, Evan Morikawa <span dir="ltr">&lt;<a href="mailto:evan@evanmorikawa.com" target="_blank">evan@evanmorikawa.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Y,<div><div><br></div><div>YES! We&#39;d love to go to Puerto Rico. I just signed up on the site for Nora and I. I&#39;m so sorry for the LONG delay on getting back to you. We had a lot of other commitments up in the air around then. So excited to see you guys in Puerto Rico!</div></div><div><br></div><div>Also, I&#39;m unfortunately in NYC the week of Nov 7th and back in SF week of the 14th, otherwise I&#39;d love to see you here too.</div><div><br></div><div>Evan</div><img class="m_2465269450974321714n1-open" width="0" height="0" style="border:0;width:0;height:0" src="https://link.nylas.com/open/1ocrhlu1fap8935xrnic0cmnm/local-139b0028-d812?r=eWlmYW56aGFuZzJAZ21haWwuY29t"><div class="HOEnZb"><div class="h5">
<div class="gmail_quote m_2465269450974321714nylas-quote m_2465269450974321714nylas-quote-id-92my6rmekrk94aws2clzwhwgy">
<br>
On Nov 3 2016, at 6:56 pm, Y J &lt;<a href="mailto:YJ2@gmail.com" target="_blank">YJ2@gmail.com</a>&gt; wrote:
<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div dir="ltr">Hi Evan &amp; Nora,<div>We&#39;re getting down to the wire and need to send final counts to our vendors <b>tomorrow</b>.&nbsp; If you could please let us know your RSVP for the Dec 10 engagement party and/or the Jan 15 wedding via our website, that would be amazing!&nbsp; Hope to see you soon :)</div><div><br></div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Ftheknot.com%2Fus%2FY-and-geoff&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">theknot.com/us/Y-and-geoff</a><br></div><div><br></div><div>Y &amp; Geoff</div>
<div><div><br></div>-- <br><div><div dir="ltr"><div>Ms. Y J<br><br><div> College 2010<br>AB in Economics</div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Fwww.twitter.com%2FYz&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">@Yz</a></div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Fwww.app.com&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">www.app.com</a></div><div><br></div></div></div></div>
</div>
</div>
<img src="https://YJ2-dot-yamm-track.appspot.com/FireBase?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177" width="1" height="1" alt="beacon" style="display:none;display:none!important">
</blockquote>
</div></div></div></blockquote></div><br><br clear="all"><div><br></div>-- <br><div class="gmail_signature" data-smartmail="gmail_signature"><div dir="ltr"><div>Ms. Y J<br><br><div> College 2010<br>AB in Economics</div><div><a href="http://www.twitter.com/Yz" target="_blank">@Yz</a></div><div><a href="http://www.app.com" target="_blank">www.app.com</a></div><div><br></div></div></div></div>
<div>This is some unique text after the signature. It's as if I'm
typing inline. We should NOT collapse this area</div>
</div>

View file

@ -0,0 +1,16 @@
<div dir="ltr">Yaaay!&nbsp; So excited :) &nbsp;And no worries, see you in PR, if not before</div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Nov 4, 2016 at 2:07 PM, Evan Morikawa <span dir="ltr">&lt;<a href="mailto:evan@evanmorikawa.com" target="_blank">evan@evanmorikawa.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Y,<div><div><br></div><div>YES! We'd love to go to Puerto Rico. I just signed up on the site for Nora and I. I'm so sorry for the LONG delay on getting back to you. We had a lot of other commitments up in the air around then. So excited to see you guys in Puerto Rico!</div></div><div><br></div><div>Also, I'm unfortunately in NYC the week of Nov 7th and back in SF week of the 14th, otherwise I'd love to see you here too.</div><div><br></div><div>Evan</div><img class="m_2465269450974321714n1-open" width="0" height="0" style="border:0;width:0;height:0" src="https://link.nylas.com/open/1ocrhlu1fap8935xrnic0cmnm/local-139b0028-d812?r=eWlmYW56aGFuZzJAZ21haWwuY29t"><div class="HOEnZb"><div class="h5">
<div class="gmail_quote m_2465269450974321714nylas-quote m_2465269450974321714nylas-quote-id-92my6rmekrk94aws2clzwhwgy">
<br>
On Nov 3 2016, at 6:56 pm, Y J &lt;<a href="mailto:YJ2@gmail.com" target="_blank">YJ2@gmail.com</a>&gt; wrote:
<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<div dir="ltr">Hi Evan &amp; Nora,<div>We're getting down to the wire and need to send final counts to our vendors <b>tomorrow</b>.&nbsp; If you could please let us know your RSVP for the Dec 10 engagement party and/or the Jan 15 wedding via our website, that would be amazing!&nbsp; Hope to see you soon :)</div><div><br></div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Ftheknot.com%2Fus%2FY-and-geoff&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">theknot.com/us/Y-and-geoff</a><br></div><div><br></div><div>Y &amp; Geoff</div>
<div><div><br></div>-- <br><div><div dir="ltr"><div>Ms. Y J<br><br><div> College 2010<br>AB in Economics</div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Fwww.twitter.com%2FYz&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">@Yz</a></div><div><a href="https://YJ2-dot-yamm-track.appspot.com/Redirect?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177&amp;link=http%3A%2F%2Fwww.app.com&amp;r=eWlmYW56aGFuZzJAZ21haWwuY29t" target="_blank">www.app.com</a></div><div><br></div></div></div></div>
</div>
</div>
<img src="https://YJ2-dot-yamm-track.appspot.com/FireBase?ukey=1KbnLIl4_8tooUTgmQ_uMogd5HthhVhfP6x6UFR8wq28-0&amp;key=YAMMID-24566177" width="1" height="1" alt="beacon" style="display:none;display:none!important">
</blockquote>
</div></div></div></blockquote></div><br><br clear="all"><div><br></div>-- <br><div class="gmail_signature" data-smartmail="gmail_signature"><div dir="ltr"><div>Ms. Y J<br><br><div> College 2010<br>AB in Economics</div><div><a href="http://www.twitter.com/Yz" target="_blank">@Yz</a></div><div><a href="http://www.app.com" target="_blank">www.app.com</a></div><div><br></div></div></div></div>
<div>This is some unique text after the signature. It's as if I'm
typing inline. We should NOT collapse this area</div>
</div>

View file

@ -19,7 +19,7 @@ describe "QuotedHTMLTransformer", ->
re = new RegExp(QuotedHTMLTransformer.annotationClass, 'g') re = new RegExp(QuotedHTMLTransformer.annotationClass, 'g')
html.match(re)?.length ? 0 html.match(re)?.length ? 0
[1..19].forEach (n) -> [1..21].forEach (n) ->
it "properly parses email_#{n}", -> it "properly parses email_#{n}", ->
opts = keepIfWholeBodyIsQuote: true opts = keepIfWholeBodyIsQuote: true
expect(removeQuotedHTML("email_#{n}.html", opts).trim()).toEqual(readFile("email_#{n}_stripped.html").trim()) expect(removeQuotedHTML("email_#{n}.html", opts).trim()).toEqual(readFile("email_#{n}_stripped.html").trim())
@ -405,7 +405,7 @@ describe "QuotedHTMLTransformer", ->
# `QuotedHTMLTransformer` needs Electron booted up in order to work because # `QuotedHTMLTransformer` needs Electron booted up in order to work because
# of the DOMParser. # of the DOMParser.
xit "Run this simple function to generate output files", -> xit "Run this simple function to generate output files", ->
[19..20].forEach (n) -> [21].forEach (n) ->
newHTML = QuotedHTMLTransformer.removeQuotedHTML(readFile("email_#{n}.html")) newHTML = QuotedHTMLTransformer.removeQuotedHTML(readFile("email_#{n}.html"))
outPath = path.resolve(__dirname, '..', 'fixtures', 'emails', "email_#{n}_raw_stripped.html") outPath = path.resolve(__dirname, '..', 'fixtures', 'emails', "email_#{n}_raw_stripped.html")
fs.writeFileSync(outPath, newHTML) fs.writeFileSync(outPath, newHTML)

View file

@ -10,6 +10,8 @@ import DOMWalkers from '../dom-walkers'
* the document. * the document.
* *
* This is in its own file to make use of ES6 generators * This is in its own file to make use of ES6 generators
*
* See email_19 as a test case for this.
*/ */
export default function quoteStringDetector(doc) { export default function quoteStringDetector(doc) {
const quoteNodesToRemove = []; const quoteNodesToRemove = [];

View file

@ -1,6 +1,7 @@
import _ from 'underscore'; import _ from 'underscore';
import DOMUtils from '../dom-utils'; import DOMUtils from '../dom-utils';
import quoteStringDetector from './quote-string-detector'; import quoteStringDetector from './quote-string-detector';
import unwrappedSignatureDetector from './unwrapped-signature-detector';
class QuotedHTMLTransformer { class QuotedHTMLTransformer {
@ -151,6 +152,7 @@ class QuotedHTMLTransformer {
for (const parser of parsers) { for (const parser of parsers) {
quoteElements = quoteElements.concat(parser(doc) || []); quoteElements = quoteElements.concat(parser(doc) || []);
} }
quoteElements = quoteElements.concat(unwrappedSignatureDetector(doc, quoteElements))
if (!includeInline && quoteElements.length > 0) { if (!includeInline && quoteElements.length > 0) {
// This means we only want to remove quoted text that shows up at the // This means we only want to remove quoted text that shows up at the

View file

@ -0,0 +1,43 @@
import DOMWalkers from '../dom-walkers'
import Utils from '../flux/models/utils'
/**
 * Gathers everything that comes after `node`: its own following siblings,
 * then the following siblings of its parent, of its grandparent, and so on
 * until the `parentNode` chain ends.
 *
 * @param {Node} node - The node to start from. The node itself is not
 *   included in the result. A falsy value yields an empty result.
 * @returns {{text: string, nodes: Node[]}} `nodes` holds the collected
 *   siblings in the order they were visited; `text` is the concatenation
 *   of each collected node's `textContent`, in that same order.
 */
function textAndNodesAfterNode(node) {
  const nodes = [];
  let text = "";
  // Outer loop climbs towards the root; inner loop sweeps rightwards
  // across the siblings that follow the current ancestor.
  for (let ancestor = node; ancestor; ancestor = ancestor.parentNode) {
    for (let next = ancestor.nextSibling; next; next = next.nextSibling) {
      text += next.textContent;
      nodes.push(next);
    }
  }
  return {text, nodes};
}
/**
 * Sometimes the last signature of an email is not placed inside a quote
 * block. That causes our quote detector to strip nothing, because the
 * trailing signature looks just like someone typing regular text inline
 * after some quoted text (which is allowed).
 *
 * This detector takes the first node yielded by
 * `DOMWalkers.walkBackwards(doc)` that is also in `quoteElements`, gathers
 * everything that follows it, and checks whether that trailing text
 * (trimmed) already appears verbatim inside the quote element's own text.
 * If it does, the trailing nodes are reported as quoted content.
 *
 * See email_20 and email_21 as a test case for this.
 *
 * @param {Document} doc - Document handed to `DOMWalkers.walkBackwards`.
 * @param {Array} quoteElements - Nodes already identified as quoted text.
 * @returns {Array} The nodes following the matched quote element when
 *   their text repeats inside it; otherwise an empty array.
 */
export default function unwrappedSignatureDetector(doc, quoteElements) {
  for (const candidate of DOMWalkers.walkBackwards(doc)) {
    if (!quoteElements.includes(candidate)) {
      continue;
    }

    // Only the first match from the backwards walk is ever examined;
    // every path below leaves the loop.
    const trailing = textAndNodesAfterNode(candidate);
    const trailingText = trailing.text.trim();
    if (trailingText.length === 0) {
      return [];
    }

    // Escaping makes the regex search behave as a literal substring
    // lookup of the trailing text within the quote's text.
    const quotedText = candidate.textContent || "";
    const literalPattern = Utils.escapeRegExp(trailingText);
    return quotedText.search(literalPattern) >= 0 ? trailing.nodes : [];
  }
  return [];
}