From e924e74c1b182e1f6bfa759ebfe6e3bab833ac4d Mon Sep 17 00:00:00 2001 From: Christine Spang Date: Tue, 20 Dec 2016 15:10:22 -0800 Subject: [PATCH] [local-sync] Optimize snippet extraction Summary: We were seeing JS blocking in snippet extraction of up to 2k ms. This is because we were walking the entire DOM of a message and extracting all text, regardless of message size---and using our own homegrown DOM walker function. To remedy this, use the standard TreeWalker from the Chrome browser APIs (which in benchmarks looks 2-4x faster) and also exit out of the DOM walking process once we've accumulated enough text to create a snippet. Informal eyeballing of timing metrics for this function suggests the new implementation is something like 10-100x faster for some messages. As a bonus, we get to delete some code and end up with a cleaner implementation! Test Plan: old unit tests yaay Reviewers: juan Reviewed By: juan Subscribers: evan Differential Revision: https://phab.nylas.com/D3543 --- .../local-sync/src/shared/message-factory.js | 51 +++++++++---------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/packages/local-sync/src/shared/message-factory.js b/packages/local-sync/src/shared/message-factory.js index 52fcce9eb..76ecbbb72 100644 --- a/packages/local-sync/src/shared/message-factory.js +++ b/packages/local-sync/src/shared/message-factory.js @@ -37,48 +37,43 @@ function extractContacts(input) { } -// Iteratively walk the DOM of this document's <body>, calling the callback on -// each node. Skip any nodes and the skipTags set, including their children. 
-function _walkBodyDOM(doc, callback, skipTags) { - let nodes = Array.from(doc.body.childNodes); - - while (nodes.length) { - const node = nodes.shift(); - - callback(node); - - if (!skipTags.has(node.tagName)) { - if (node.childNodes && node.childNodes.length) { - nodes = Array.from(node.childNodes).concat(nodes); - } - } - } -} - - function extractSnippet(plainBody, htmlBody) { - let snippetText = plainBody || ''; + let snippetText = plainBody ? plainBody.trim() : ''; if (htmlBody) { const doc = new DOMParser().parseFromString(htmlBody, 'text/html') - const extractedTextElements = []; + const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']); - _walkBodyDOM(doc, (node) => { + const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => { + if (skipTags.has(node.tagName)) { + // skip this node and all its children + return NodeFilter.FILTER_REJECT; + } if (node.nodeType === Node.TEXT_NODE) { const nodeValue = node.nodeValue ? node.nodeValue.trim() : null; if (nodeValue) { - extractedTextElements.push(nodeValue); + return NodeFilter.FILTER_ACCEPT; } } - }, new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG'])); + return NodeFilter.FILTER_SKIP; + }); - const extractedText = extractedTextElements.join(' ').trim(); - if (extractedText) { - snippetText = extractedText; + let extractedText = ""; + while (treeWalker.nextNode()) { + // TODO: there may be some elements we don't want to add a space between + if (extractedText) { + extractedText += " "; + } + extractedText += treeWalker.currentNode.nodeValue; + if (extractedText.length > SNIPPET_MAX_SIZE) { + break; + } } + + snippetText = extractedText; } // clean up and trim snippet - let trimmed = snippetText.trim().replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE); + let trimmed = snippetText.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE); if (trimmed) { // TODO: strip quoted text from snippets also // trim down to approx. 
SNIPPET_SIZE w/out cutting off words right in the