fix(snippet-parsing): Don't add extraneous spaces after text format tags

Summary:
Unconditionally adding a space before every text node was leading us to put funny
things like 'Nylas !' in some snippets that used tags like <i> and <b> for text
formatting. This fix is probably a little bit slower than the previous version since
it invokes a callback on a lot more nodes, but we can't really fix this issue without
knowledge of the preceding tag name.

Test Plan: unit test included!!

Reviewers: evan, jackie

Reviewed By: jackie

Differential Revision: https://phab.nylas.com/D3553
This commit is contained in:
Christine Spang 2016-12-22 17:08:18 -08:00
parent 99899be7c2
commit de1b67287c
3 changed files with 25 additions and 10 deletions

View file

@ -1 +1 @@
Finance for our generation . Hi Christine, here's the news you need to know for December 13th . Reading Finance for our generation. Hi Christine, here's the news you need to know for December 13th. Reading

View file

@ -73,6 +73,15 @@ const snippetTestCases = [{
plainBody: null, plainBody: null,
htmlBody: '<p>Unicorns are<!-- an HTML comment! -->native to the</p>', htmlBody: '<p>Unicorns are<!-- an HTML comment! -->native to the</p>',
snippet: 'Unicorns are native to the', snippet: 'Unicorns are native to the',
}, {
purpose: "don't add extraneous spaces after text format markup",
plainBody: null,
htmlBody: `
<td style="padding: 0px 10px">
Hey there, <b>Nylas</b>!<br>
You have a new follower on Product Hunt.
</td>`,
snippet: 'Hey there, Nylas! You have a new follower on Product Hunt.',
}, },
] ]

View file

@ -42,6 +42,7 @@ function extractSnippet(plainBody, htmlBody) {
if (htmlBody) { if (htmlBody) {
const doc = new DOMParser().parseFromString(htmlBody, 'text/html') const doc = new DOMParser().parseFromString(htmlBody, 'text/html')
const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']); const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
const noSpaceTags = new Set(['B', 'I', 'STRONG', 'EM', 'SPAN']);
const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => { const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
if (skipTags.has(node.tagName)) { if (skipTags.has(node.tagName)) {
@ -53,23 +54,28 @@ function extractSnippet(plainBody, htmlBody) {
if (nodeValue) { if (nodeValue) {
return NodeFilter.FILTER_ACCEPT; return NodeFilter.FILTER_ACCEPT;
} }
return NodeFilter.FILTER_SKIP;
} }
return NodeFilter.FILTER_SKIP; return NodeFilter.FILTER_ACCEPT;
}); });
let extractedText = ""; let extractedText = "";
let lastNodeTag = "";
while (treeWalker.nextNode()) { while (treeWalker.nextNode()) {
// TODO: there may be some elements we don't want to add a space between if (treeWalker.currentNode.nodeType === Node.ELEMENT_NODE) {
if (extractedText) { lastNodeTag = treeWalker.currentNode.nodeName;
extractedText += " "; } else {
} if (extractedText && !noSpaceTags.has(lastNodeTag)) {
extractedText += treeWalker.currentNode.nodeValue; extractedText += " ";
if (extractedText.length > SNIPPET_MAX_SIZE) { }
break; extractedText += treeWalker.currentNode.nodeValue;
if (extractedText.length > SNIPPET_MAX_SIZE) {
break;
}
} }
} }
snippetText = extractedText; snippetText = extractedText.trim();
} }
// clean up and trim snippet // clean up and trim snippet