mirror of
https://github.com/Foundry376/Mailspring.git
synced 2024-09-22 08:16:09 +08:00
[local-sync] Optimize snippet extraction
Summary: We were seeing JS blocking in snippet extraction of up to 2k ms. This was because we were walking the entire DOM of a message and extracting all text, regardless of message size—and using our own homegrown DOM walker function. To remedy this, use the standard TreeWalker from the Chrome browser APIs (which in benchmarks looks 2-4x faster) and also exit out of the DOM walking process once we've accumulated enough text to create a snippet. Informal eyeballing of timing metrics for this function suggests the new implementation is something like 10-100x faster for some messages. As a bonus, we get to delete some code and end up with a cleaner implementation!
Test Plan: old unit tests yaay
Reviewers: juan
Reviewed By: juan
Subscribers: evan
Differential Revision: https://phab.nylas.com/D3543
This commit is contained in:
parent
5015d105b8
commit
e924e74c1b
|
@ -37,48 +37,43 @@ function extractContacts(input) {
|
|||
}
|
||||
|
||||
|
||||
// Iteratively walk the DOM of this document's <body> in pre-order (a node is
// visited before its children, siblings left to right), calling the callback
// on each node. A node whose tagName is in the skipTags set is still passed to
// the callback, but its children are not visited.
//
// doc      - object exposing `body.childNodes`; if there is no body, nothing
//            is visited
// callback - function invoked once with each visited node
// skipTags - Set of tagName values whose subtrees are not descended into
function _walkBodyDOM(doc, callback, skipTags) {
  if (!doc || !doc.body) {
    return;
  }

  // Pending nodes live on a stack whose *last* element is the next node to
  // visit. pop()/push() are constant-time, whereas shift() + concat() copied
  // the whole pending list for every node visited.
  const pending = Array.from(doc.body.childNodes).reverse();

  while (pending.length) {
    const node = pending.pop();

    callback(node);

    if (skipTags.has(node.tagName)) {
      continue;
    }

    if (node.childNodes && node.childNodes.length) {
      // Snapshot the children after the callback has run, then push them in
      // reverse so the first child ends up on top and is visited next.
      const children = Array.from(node.childNodes);
      for (let i = children.length - 1; i >= 0; i--) {
        pending.push(children[i]);
      }
    }
  }
}
|
||||
|
||||
|
||||
function extractSnippet(plainBody, htmlBody) {
|
||||
let snippetText = plainBody || '';
|
||||
let snippetText = plainBody ? plainBody.trim() : '';
|
||||
if (htmlBody) {
|
||||
const doc = new DOMParser().parseFromString(htmlBody, 'text/html')
|
||||
const extractedTextElements = [];
|
||||
const skipTags = new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']);
|
||||
|
||||
_walkBodyDOM(doc, (node) => {
|
||||
const treeWalker = document.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT, (node) => {
|
||||
if (skipTags.has(node.tagName)) {
|
||||
// skip this node and all its children
|
||||
return NodeFilter.FILTER_REJECT;
|
||||
}
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
const nodeValue = node.nodeValue ? node.nodeValue.trim() : null;
|
||||
if (nodeValue) {
|
||||
extractedTextElements.push(nodeValue);
|
||||
return NodeFilter.FILTER_ACCEPT;
|
||||
}
|
||||
}
|
||||
}, new Set(['TITLE', 'SCRIPT', 'STYLE', 'IMG']));
|
||||
return NodeFilter.FILTER_SKIP;
|
||||
});
|
||||
|
||||
const extractedText = extractedTextElements.join(' ').trim();
|
||||
if (extractedText) {
|
||||
snippetText = extractedText;
|
||||
let extractedText = "";
|
||||
while (treeWalker.nextNode()) {
|
||||
// TODO: there may be some elements we don't want to add a space between
|
||||
if (extractedText) {
|
||||
extractedText += " ";
|
||||
}
|
||||
extractedText += treeWalker.currentNode.nodeValue;
|
||||
if (extractedText.length > SNIPPET_MAX_SIZE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
snippetText = extractedText;
|
||||
}
|
||||
|
||||
// clean up and trim snippet
|
||||
let trimmed = snippetText.trim().replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
|
||||
let trimmed = snippetText.replace(/[\n\r]/g, ' ').replace(/\s\s+/g, ' ').substr(0, SNIPPET_MAX_SIZE);
|
||||
if (trimmed) {
|
||||
// TODO: strip quoted text from snippets also
|
||||
// trim down to approx. SNIPPET_SIZE w/out cutting off words right in the
|
||||
|
|
Loading…
Reference in a new issue