From e924e74c1b182e1f6bfa759ebfe6e3bab833ac4d Mon Sep 17 00:00:00 2001
From: Christine Spang <spang@nylas.com>
Date: Tue, 20 Dec 2016 15:10:22 -0800
Subject: [PATCH] [local-sync] Optimize snippet extraction

Summary:
We were seeing JS blocking in snippet extraction of up to 2000 ms. This
is because we were walking the entire DOM of a message and extracting
all text, regardless of message size, and doing so with our own
homegrown DOM walker function.

To remedy this, use the standard DOM TreeWalker API provided by Chrome
(which in benchmarks looks 2-4x faster than our walker) and also exit out
of the DOM walking process once we've accumulated enough text to create
a snippet. Informal eyeballing of timing metrics for this function suggests
the new implementation is something like 10-100x faster for some messages.

As a bonus, we get to delete some code and end up with a cleaner
implementation!

Test Plan: existing unit tests still pass

Reviewers: juan

Reviewed By: juan

Subscribers: evan

Differential Revision: https://phab.nylas.com/D3543
---
 .../local-sync/src/shared/message-factory.js  | 51 +++++++++----------
 1 file changed, 23 insertions(+), 28 deletions(-)
diff --git a/packages/local-sync/src/shared/message-factory.js b/packages/local-sync/src/shared/message-factory.js
index 52fcce9eb..76ecbbb72 100644
--- a/packages/local-sync/src/shared/message-factory.js
+++ b/packages/local-sync/src/shared/message-factory.js
@@ -37,48 +37,43 @@ function extractContacts(input) {
 }
 
 
-// Iteratively walk the DOM of this document's <body>, calling the callback on
-// each node. Skip any nodes and the skipTags set, including their children.
-function _walkBodyDOM(doc, callback, skipTags) {
-  let nodes = Array.from(doc.body.childNodes);
-
-  while (nodes.length) {
-    const node = nodes.shift();
-
-    callback(node);
-
-    if (!skipTags.has(node.tagName)) {
-      if (node.childNodes && node.childNodes.length) {
-        nodes = Array.from(node.childNodes).concat(nodes);
-      }
-    }
-  }
-}
-
-
 function extractSnippet(plainBody, htmlBody) {
-  let snippetText = plainBody || '';
+  let snippetText = plainBody ? plainBody.trim() : '';
   if (htmlBody) {
     const doc = new DOMParser().parseFromString(htmlBody, 'text/html')
-    const extractedTextElements = [];
+    const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
 
-    _walkBodyDOM(doc, (node) => {
+    const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
+      if (skipTags.has(node.tagName)) {
+        // skip this node and all its children
+        return NodeFilter.FILTER_REJECT;
+      }
       if (node.nodeType === Node.TEXT_NODE) {
         const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
         if (nodeValue) {
-          extractedTextElements.push(nodeValue);
+          return NodeFilter.FILTER_ACCEPT;
         }
       }
-    }, new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']));
+      return NodeFilter.FILTER_SKIP;
+    });
 
-    const extractedText = extractedTextElements.join(' ').trim();
-    if (extractedText) {
-      snippetText = extractedText;
+    let extractedText = "";
+    while (treeWalker.nextNode()) {
+      // TODO: there may be some elements we don't want to add a space between
+      if (extractedText) {
+        extractedText += " ";
+      }
+      extractedText += treeWalker.currentNode.nodeValue;
+      if (extractedText.length > SNIPPET_MAX_SIZE) {
+        break;
+      }
     }
+
+    snippetText = extractedText;
   }
 
   // clean up and trim snippet
-  let trimmed = snippetText.trim().replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
+  let trimmed = snippetText.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
   if (trimmed) {
     // TODO: strip quoted text from snippets also
     // trim down to approx. SNIPPET_SIZE w/out cutting off words right in the