[local-sync] Optimize snippet extraction

Summary: We were seeing JS blocking of up to 2,000 ms in snippet extraction. This is because we were walking the entire DOM of a message and extracting all text, regardless of message size—and using our own homegrown DOM walker function. To remedy this, use the standard TreeWalker from the Chrome browser APIs (which in benchmarks looks 2-4x faster) and also exit the DOM walk once we've accumulated enough text to create a snippet. Informal eyeballing of timing metrics for this function suggests the new implementation is something like 10-100x faster for some messages. As a bonus, we get to delete some code and end up with a cleaner implementation!
Test Plan: old unit tests yaay
Reviewers: juan
Reviewed By: juan
Subscribers: evan
Differential Revision: https://phab.nylas.com/D3543
2024-09-22 08:16:09 +08:00 · 2016-12-20 15:10:22 -08:00 · 2016-12-20 15:10:22 -08:00 · e924e74c1b
parent 5015d105b8
commit e924e74c1b
1 changed files with 23 additions and 28 deletions
--- a/packages/local-sync/src/shared/message-factory.js
+++ b/packages/local-sync/src/shared/message-factory.js
@@ -37,48 +37,43 @@ function extractContacts(input) {
 }


-// Iteratively walk the DOM of this document's <body>, calling the callback on
-// each node. Skip any nodes and the skipTags set, including their children.
-function _walkBodyDOM(doc, callback, skipTags) {
-  let nodes = Array.from(doc.body.childNodes);
-
-  while (nodes.length) {
-    const node = nodes.shift();
-
-    callback(node);
-
-    if (!skipTags.has(node.tagName)) {
-      if (node.childNodes && node.childNodes.length) {
-        nodes = Array.from(node.childNodes).concat(nodes);
-      }
-    }
-  }
-}
-
-
 function extractSnippet(plainBody, htmlBody) {
-  let snippetText = plainBody || '';
+  let snippetText = plainBody ? plainBody.trim() : '';
  if (htmlBody) {
    const doc = new DOMParser().parseFromString(htmlBody, 'text/html')
-    const extractedTextElements = [];
+    const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);

-    _walkBodyDOM(doc, (node) => {
+    const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
+      if (skipTags.has(node.tagName)) {
+        // skip this node and all its children
+        return NodeFilter.FILTER_REJECT;
+      }
      if (node.nodeType === Node.TEXT_NODE) {
        const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
        if (nodeValue) {
-          extractedTextElements.push(nodeValue);
+          return NodeFilter.FILTER_ACCEPT;
        }
      }
-    }, new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']));
+      return NodeFilter.FILTER_SKIP;
+    });

-    const extractedText = extractedTextElements.join(' ').trim();
-    if (extractedText) {
-      snippetText = extractedText;
+    let extractedText = "";
+    while (treeWalker.nextNode()) {
+      // TODO: there may be some elements we don't want to add a space between
+      if (extractedText) {
+        extractedText += " ";
+      }
+      extractedText += treeWalker.currentNode.nodeValue;
+      if (extractedText.length > SNIPPET_MAX_SIZE) {
+        break;
+      }
    }
+
+    snippetText = extractedText;
  }

  // clean up and trim snippet
-  let trimmed = snippetText.trim().replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
+  let trimmed = snippetText.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
  if (trimmed) {
    // TODO: strip quoted text from snippets also
    // trim down to approx. SNIPPET_SIZE w/out cutting off words right in the