Broaden search for header blocks, trim trailing <hr>

This commit is contained in:
Ben Gotow 2017-09-25 12:20:14 -07:00
parent 93ea673a1d
commit 9347ef25f7
4 changed files with 83 additions and 7 deletions

50
app/spec/fixtures/emails/email_24.html vendored Normal file
View file

@ -0,0 +1,50 @@
<div id="inbox-html-wrapper"><div style="padding-bottom: 20px;"></div><div>
<meta content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
<title></title>
<div>
<p><span style="color:#1F497D">Pretty much, I
think well have to turn to this later as its low
priority.</span></p>
<p><span style="color:#1F497D">&nbsp;</span></p>
<div>
<p style="margin-bottom:10.0pt;line-height:115%">
<b><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D">
XXXXX<br></span></b><b><span style="font-family:&quot;TimesNewRoman&quot;,&quot;serif&quot;;color:#056063">X,
XXXXX<br></span></b><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black">X
Avenue of the Americas<br>
New York, NY 10104-3800<br>
</div>
<p><span style="color:#1F497D">&nbsp;</span></p>
<div>
<div style="border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0in 0in 0in">
<p> </p>
</div>
</div>
<p>&nbsp;</p>
<p>&nbsp;</p>
<p>&nbsp;</p>
<p>&nbsp;</p>
<div>
<p style="margin-left:.5in">&nbsp;</p>
</div>
<p style="margin-left:.5in">&nbsp;</p>
<p style="margin-left:.5in">&nbsp;</p>
<p style="mso-margin-top-alt:0in;margin-right:0in;margin-bottom:12.0pt;margin-left:.5in">
&nbsp;</p>
<div style="margin-left:.5in">
<div align="center" style="text-align:center">
<hr size="2" width="100%" align="center"></div></div></div></div></div>

View file

@ -0,0 +1,22 @@
<div id="inbox-html-wrapper"><div style="padding-bottom: 20px;"></div><div>
<meta content="text/html; charset=utf-8">
<meta name="Generator" content="Microsoft Word 14 (filtered medium)">
<title></title>
<div>
<p><span style="color:#1F497D">Pretty much, I
think well have to turn to this later as its low
priority.</span></p>
<p><span style="color:#1F497D">&nbsp;</span></p>
<div>
<p style="margin-bottom:10.0pt;line-height:115%">
<b><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:#1F497D">
XXXXX<br></span></b><b><span style="font-family:&quot;TimesNewRoman&quot;,&quot;serif&quot;;color:#056063">X,
XXXXX<br></span></b><span style="font-size:10.0pt;line-height:115%;font-family:&quot;Arial&quot;,&quot;sans-serif&quot;;color:black">X
Avenue of the Americas<br>
New York, NY 10104-3800</span></p></div></div></div></div>

View file

@ -19,7 +19,7 @@ describe "QuotedHTMLTransformer", ->
re = new RegExp(QuotedHTMLTransformer.annotationClass, 'g')
html.match(re)?.length ? 0
[1..23].forEach (n) ->
[1..24].forEach (n) ->
it "properly parses email_#{n}", ->
opts = keepIfWholeBodyIsQuote: true
expect(removeQuotedHTML("email_#{n}.html", opts).trim()).toEqual(readFile("email_#{n}_stripped.html").trim())

View file

@ -116,7 +116,7 @@ class QuotedHTMLTransformer {
continue;
}
}
if (['BR', 'P', 'DIV', 'SPAN'].includes(child.nodeName)) {
if (['BR', 'P', 'DIV', 'SPAN', 'HR'].includes(child.nodeName)) {
removeTrailingWhitespaceChildren(child);
if ((child.childElementCount === 0) && (child.textContent.trim() === '')) {
child.remove();
@ -377,14 +377,18 @@ class QuotedHTMLTransformer {
_findQuotesAfterMessageHeaderBlock(doc) {
// This detector looks for an element in the DOM tree containing
// three children: <b>Sent:</b> and <b>To:</b> and <b>Subject:</b>.
// It then returns every node after that as quoted text.
// three children: <b>Sent:</b> or <b>Date:</b> and <b>To:</b> and
// <b>Subject:</b>. It then returns every node after that as quoted text.
// Find a DOM node exactly matching <b>Sent:</b> (or, failing that, <b>Date:</b>)
const to = doc.evaluate("//b[. = 'Sent:']", doc.body, null, XPathResult.ANY_TYPE, null).iterateNext();
if (to) {
const dateMarker = (
doc.evaluate("//b[. = 'Sent:']", doc.body, null, XPathResult.ANY_TYPE, null).iterateNext() ||
doc.evaluate("//b[. = 'Date:']", doc.body, null, XPathResult.ANY_TYPE, null).iterateNext()
);
if (dateMarker) {
// check to see if the parent container also contains the other two
const headerContainer = to.parentElement;
const headerContainer = dateMarker.parentElement;
let matches = 0;
for (const node of Array.from(headerContainer.children)) {
if ((node.textContent === "To:") || (node.textContent === "Subject:")) {