mirror of
https://github.com/Foundry376/Mailspring.git
synced 2024-09-22 00:06:06 +08:00
convert(es6): quoted-html-transformer to es6
This commit is contained in:
parent
5a20dfce76
commit
4a40074cd1
|
@ -1,7 +1,7 @@
|
|||
_ = require('underscore')
|
||||
fs = require('fs')
|
||||
path = require 'path'
|
||||
QuotedHTMLTransformer = require('../../src/services/quoted-html-transformer')
|
||||
QuotedHTMLTransformer = require('../../src/services/quoted-html-transformer').default
|
||||
|
||||
describe "QuotedHTMLTransformer", ->
|
||||
|
||||
|
@ -407,5 +407,5 @@ describe "QuotedHTMLTransformer", ->
|
|||
xit "Run this simple function to generate output files", ->
|
||||
[18].forEach (n) ->
|
||||
newHTML = QuotedHTMLTransformer.removeQuotedHTML(readFile("email_#{n}.html"))
|
||||
outPath = path.resolve(__dirname, 'fixtures', 'emails', "email_#{n}_raw_stripped.html")
|
||||
outPath = path.resolve(__dirname, '..', 'fixtures', 'emails', "email_#{n}_raw_stripped.html")
|
||||
fs.writeFileSync(outPath, newHTML)
|
||||
|
|
|
@ -1,216 +0,0 @@
|
|||
_ = require 'underscore'
|
||||
crypto = require 'crypto'
|
||||
DOMUtils = require '../dom-utils'
|
||||
quoteStringDetector = require('./quote-string-detector').default
|
||||
|
||||
# Finds quoted-text regions in an HTML email body (elements carrying Gmail's
# `.gmail_quote` class, the Office 365 `#divRplyFwdMsg` /
# `#OLK_SRC_BODY_SECTION` markers, and plain `<blockquote>` elements) and
# hides, removes, re-appends or restores them.
class QuotedHTMLTransformer

  annotationClass: "nylas-quoted-text-segment"

  # Public: Hides quoted text inside an HTML string. Every detected quote
  # element gets the `annotationClass` and an inline `display: none`.
  #
  # - `html` The HTML string to process
  # - `options`
  #   - `keepIfWholeBodyIsQuote` Defaults false. If true and the only
  #     non-blank child of the body is itself a quote, nothing is hidden.
  #
  # Returns the resulting HTML string
  hideQuotedHTML: (html, {keepIfWholeBodyIsQuote}={}) ->
    doc = @_parseHTML(html)
    quoteElements = @_findQuoteLikeElements(doc)
    unless keepIfWholeBodyIsQuote and @_wholeBodyIsQuote(doc, quoteElements)
      @_annotateElements(quoteElements)
    return @_outputHTMLFor(doc, {initialHTML: html})

  # Public: Returns true if at least one (trailing) quote element is found
  # in the given HTML string.
  hasQuotedHTML: (html) ->
    doc = @_parseHTML(html)
    quoteElements = @_findQuoteLikeElements(doc)
    return quoteElements.length > 0

  # Public: Removes quoted text from an HTML string
  #
  # If we find a quoted text region that is "inline" with the root level
  # message, meaning it has non quoted text before and after it, then we
  # leave it in the message. If you set the `includeInline` option to true,
  # then all inline blocks will also be removed.
  #
  # - `html` The string full of quoted text areas
  # - `options` (also forwarded to `DOMUtils.Mutating.removeElements`)
  #   - `includeInline` Defaults false. If true, inline quotes are removed
  #     too
  #   - `keepIfWholeBodyIsQuote` Defaults false. If true, then it will
  #     check to see if the whole html body is a giant quote. If so, it will
  #     preserve it.
  #   - `returnAsDOM` Defaults false. If true, the parsed document is
  #     returned instead of a string (the "everything was removed" early
  #     returns below still produce a string).
  #
  # Returns HTML without quoted text
  removeQuotedHTML: (html, options={}) ->
    doc = @_parseHTML(html)
    quoteElements = @_findQuoteLikeElements(doc, options)
    unless options.keepIfWholeBodyIsQuote and @_wholeBodyIsQuote(doc, quoteElements)
      DOMUtils.Mutating.removeElements(quoteElements, options)

      # It's possible that the entire body was quoted text and we've removed everything.
      if not doc.body
        return @_outputHTMLFor(@_parseHTML(""), {initialHTML: html})

      @removeTrailingBr(doc)
      DOMUtils.Mutating.removeElements(quoteStringDetector(doc))
      if not doc.children[0]
        return @_outputHTMLFor(@_parseHTML(""), {initialHTML: html})

    if options.returnAsDOM
      return doc
    return @_outputHTMLFor(doc, {initialHTML: html})

  # Finds any trailing BR tags and removes them in place. A BR is only
  # removed while the node right before it is also a BR, so a single BR at
  # the end of the body is left alone.
  removeTrailingBr: (doc) ->
    childNodes = doc.body.childNodes
    extraTailBrTags = []
    for i in [(childNodes.length - 1)..0] by -1
      curr = childNodes[i]
      next = childNodes[i - 1]
      if curr and curr.nodeName == 'BR' and next and next.nodeName == 'BR'
        extraTailBrTags.push(curr)
      else
        break
    DOMUtils.Mutating.removeElements(extraTailBrTags)

  # Takes the quote elements found in `originalHTML` and appends them to the
  # end of the body parsed from `htmlWithoutQuotes`.
  #
  # Returns the combined HTML string
  appendQuotedHTML: (htmlWithoutQuotes, originalHTML) ->
    doc = @_parseHTML(originalHTML)
    quoteElements = @_findQuoteLikeElements(doc)
    doc = @_parseHTML(htmlWithoutQuotes)
    doc.body.appendChild(node) for node in quoteElements
    return @_outputHTMLFor(doc, {initialHTML: originalHTML})

  # Undoes `hideQuotedHTML`: strips the annotation from every element that
  # carries `annotationClass` and puts its saved inline display back.
  restoreAnnotatedHTML: (html) ->
    doc = @_parseHTML(html)
    quoteElements = @_findAnnotatedElements(doc)
    @_removeAnnotation(quoteElements)
    return @_outputHTMLFor(doc, {initialHTML: html})

  # Parses `text` as "text/html". If the parser throws, the error is
  # reported through `NylasEnv.reportError` and a document containing the
  # error message is returned instead.
  _parseHTML: (text) ->
    domParser = new DOMParser()
    try
      doc = domParser.parseFromString(text, "text/html")
    catch error
      text = "HTML Parser Error: #{error.toString()}"
      doc = domParser.parseFromString(text, "text/html")
      NylasEnv.reportError(error)

    # As far as we can tell, when this succeeds, doc /always/ has at least
    # one child: an <html> node.
    return doc

  # Serializes `doc`. When the input HTML had an explicit <head> or <body>
  # tag the contents of the root element are returned, otherwise only the
  # contents of the body.
  _outputHTMLFor: (doc, {initialHTML}) ->
    if /<\s?head\s?>/i.test(initialHTML) || /<\s?body[\s>]/i.test(initialHTML)
      return doc.children[0].innerHTML
    else
      return doc.body.innerHTML

  # Returns true when the body has exactly one child node with non-blank
  # text and that node is one of `quoteElements`.
  _wholeBodyIsQuote: (doc, quoteElements) ->
    nonBlankChildElements = []
    for child in doc.body.childNodes
      if child.textContent.trim() is ""
        continue
      else nonBlankChildElements.push(child)

    if nonBlankChildElements.length is 1
      return nonBlankChildElements[0] in quoteElements
    else return false

  # We used to have a scheme where we cached the `doc` object, keyed by
  # the md5 of the text. Unfortunately we can't do this because the
  # `doc` is mutated in place. Returning clones of the DOM is just as
  # bad as re-parsing from string, which is very fast anyway.

  # Runs every quote parser over `doc` and returns the unique, non-null
  # elements found. Unless `includeInline` is set, only trailing quotes are
  # kept.
  _findQuoteLikeElements: (doc, {includeInline}={}) ->
    parsers = [
      @_findGmailQuotes
      @_findOffice365Quotes
      @_findBlockquoteQuotes
    ]

    quoteElements = []
    for parser in parsers
      # Keep `this` bound so a parser can rely on instance state.
      quoteElements = quoteElements.concat(parser.call(this, doc) ? [])

    if not includeInline and quoteElements.length > 0
      # This means we only want to remove quoted text that shows up at the
      # end of a message. If there were non quoted content after, it'd be
      # inline.

      trailingQuotes = @_findTrailingQuotes(doc, quoteElements)

      # Only keep the trailing quotes so we can delete them.
      quoteElements = _.intersection(quoteElements, trailingQuotes)

    return _.compact(_.uniq(quoteElements))

  # This will recursively move through the DOM, bottom to top, and pick
  # out quoted text blocks. It will stop when it reaches a visible
  # non-quote text region.
  _findTrailingQuotes: (scopeElement, quoteElements=[]) ->
    trailingQuotes = []

    # We need to find only the child nodes that have content in them. We
    # determine if it's an inline quote based on if there's VISIBLE
    # content after a piece of quoted text
    nodesWithContent = DOMUtils.nodesWithContent(scopeElement)

    # There may be multiple quote blocks that are siblings of each
    # other at the end of the message. We want to include all of these
    # trailing quote elements.
    for nodeWithContent in nodesWithContent by -1
      if nodeWithContent in quoteElements
        # This is a valid quote. Let's keep it!
        #
        # This quote block may have many more quote blocks inside of it.
        # Luckily we don't need to explicitly find all of those because
        # once this block gets removed from the DOM, we'll delete all
        # sub-quotes as well.
        trailingQuotes.push(nodeWithContent)
        continue
      else
        moreTrailing = @_findTrailingQuotes(nodeWithContent, quoteElements)
        trailingQuotes = trailingQuotes.concat(moreTrailing)
        break

    return trailingQuotes

  # Returns true if `node` is `quoteElement` or contains it.
  _contains: (node, quoteElement) ->
    node is quoteElement or node.contains(quoteElement)

  # Returns an array of all elements carrying `annotationClass`.
  _findAnnotatedElements: (doc) ->
    Array::slice.call(doc.getElementsByClassName(@annotationClass))

  # Marks each element with `annotationClass`, hides it, and remembers its
  # inline display in a data attribute so `_removeAnnotation` can undo it.
  _annotateElements: (elements=[]) ->
    for el in elements
      el.classList.add(@annotationClass)
      # If the element was annotated before, the saved value is the real
      # original; overwriting it would store "none" and make the element
      # impossible to restore.
      alreadyAnnotated = el.hasAttribute("data-nylas-quoted-text-original-display")
      originalDisplay = el.style.display
      el.style.display = "none"
      el.setAttribute("data-nylas-quoted-text-original-display", originalDisplay) unless alreadyAnnotated

  # Reverses `_annotateElements` for each element.
  _removeAnnotation: (elements=[]) ->
    for el in elements
      el.classList.remove(@annotationClass)
      originalDisplay = el.getAttribute("data-nylas-quoted-text-original-display")
      # `getAttribute` yields null when the attribute is absent; fall back
      # to an empty string so the inline display is simply cleared.
      el.style.display = originalDisplay ? ""
      el.removeAttribute("data-nylas-quoted-text-original-display")

  _findGmailQuotes: (doc) ->
    # Gmail creates both div.gmail_quote and blockquote.gmail_quote. The div
    # version marks text but does not cause indentation, but both should be
    # considered quoted text.
    return Array::slice.call(doc.querySelectorAll('.gmail_quote'))

  # Collects the Office 365 reply/forward markers. When a marker directly
  # follows an <hr>, its parent element is used as the quote instead.
  _findOffice365Quotes: (doc) ->
    elements = doc.querySelectorAll('#divRplyFwdMsg, #OLK_SRC_BODY_SECTION')
    elements = Array::slice.call(elements)

    # Also look up the marker under this literal id string.
    weirdEl = doc.getElementById('3D"divRplyFwdMsg"')
    if weirdEl then elements.push(weirdEl)

    elements = _.map elements, (el) ->
      if el.previousElementSibling and el.previousElementSibling.nodeName is "HR"
        return el.parentElement
      else return el
    return elements

  # Returns every <blockquote> element in `doc` as an array.
  _findBlockquoteQuotes: (doc) ->
    return Array::slice.call(doc.querySelectorAll('blockquote'))
|
||||
|
||||
# Export a single shared instance rather than the class itself.
module.exports = new QuotedHTMLTransformer
|
260
src/services/quoted-html-transformer.es6
Normal file
260
src/services/quoted-html-transformer.es6
Normal file
|
@ -0,0 +1,260 @@
|
|||
import _ from 'underscore';
|
||||
import DOMUtils from '../dom-utils';
|
||||
import quoteStringDetector from './quote-string-detector';
|
||||
|
||||
// Name of the attribute that stores an element's inline `display` value while
// the element is hidden by `_annotateElements`.
const ORIGINAL_DISPLAY_ATTR = "data-nylas-quoted-text-original-display";

// Finds quoted-text regions in an HTML email body (elements carrying Gmail's
// `.gmail_quote` class, the Office 365 `#divRplyFwdMsg` /
// `#OLK_SRC_BODY_SECTION` markers, and plain `<blockquote>` elements) and
// hides, removes, re-appends or restores them.
class QuotedHTMLTransformer {

  annotationClass = "nylas-quoted-text-segment";

  // Public: Hides quoted text inside an HTML string. Every detected quote
  // element gets the `annotationClass` and an inline `display: none`.
  //
  // - `html` The HTML string to process
  // - `options`
  //   - `keepIfWholeBodyIsQuote` Defaults false. If true and the only
  //     non-blank child of the body is itself a quote, nothing is hidden.
  //
  // Returns the resulting HTML string
  hideQuotedHTML(html, {keepIfWholeBodyIsQuote} = {}) {
    const doc = this._parseHTML(html);
    const quoteElements = this._findQuoteLikeElements(doc);
    if (!keepIfWholeBodyIsQuote || !this._wholeBodyIsQuote(doc, quoteElements)) {
      this._annotateElements(quoteElements);
    }
    return this._outputHTMLFor(doc, {initialHTML: html});
  }

  // Public: Returns true if at least one (trailing) quote element is found
  // in the given HTML string.
  hasQuotedHTML(html) {
    const doc = this._parseHTML(html);
    const quoteElements = this._findQuoteLikeElements(doc);
    return quoteElements.length > 0;
  }

  // Public: Removes quoted text from an HTML string
  //
  // If we find a quoted text region that is "inline" with the root level
  // message, meaning it has non quoted text before and after it, then we
  // leave it in the message. If you set the `includeInline` option to true,
  // then all inline blocks will also be removed.
  //
  // - `html` The string full of quoted text areas
  // - `options` (also forwarded to `DOMUtils.Mutating.removeElements`)
  //   - `includeInline` Defaults false. If true, inline quotes are removed
  //     too
  //   - `keepIfWholeBodyIsQuote` Defaults false. If true, then it will
  //     check to see if the whole html body is a giant quote. If so, it will
  //     preserve it.
  //   - `returnAsDOM` Defaults false. If true, the parsed document is
  //     returned instead of a string (the "everything was removed" early
  //     returns below still produce a string).
  //
  // Returns HTML without quoted text
  removeQuotedHTML(html, options = {}) {
    const doc = this._parseHTML(html);
    const quoteElements = this._findQuoteLikeElements(doc, options);
    if (!options.keepIfWholeBodyIsQuote || !this._wholeBodyIsQuote(doc, quoteElements)) {
      DOMUtils.Mutating.removeElements(quoteElements, options);

      // It's possible that the entire body was quoted text and we've removed everything.
      if (!doc.body) {
        return this._outputHTMLFor(this._parseHTML(""), {initialHTML: html});
      }

      this.removeTrailingBr(doc);
      DOMUtils.Mutating.removeElements(quoteStringDetector(doc));
      if (!doc.children[0]) {
        return this._outputHTMLFor(this._parseHTML(""), {initialHTML: html});
      }
    }

    if (options.returnAsDOM) {
      return doc;
    }
    return this._outputHTMLFor(doc, {initialHTML: html});
  }

  // Finds any trailing BR tags and removes them in place. A BR is only
  // removed while the node right before it is also a BR, so a single BR at
  // the end of the body is left alone.
  //
  // Returns whatever `DOMUtils.Mutating.removeElements` returns.
  removeTrailingBr(doc) {
    const { childNodes } = doc.body;
    const extraTailBrTags = [];
    for (let i = childNodes.length - 1; i >= 0; i--) {
      const curr = childNodes[i];
      const previous = childNodes[i - 1];
      const bothAreBr = curr && curr.nodeName === 'BR' && previous && previous.nodeName === 'BR';
      if (!bothAreBr) {
        break;
      }
      extraTailBrTags.push(curr);
    }
    return DOMUtils.Mutating.removeElements(extraTailBrTags);
  }

  // Takes the quote elements found in `originalHTML` and appends them to the
  // end of the body parsed from `htmlWithoutQuotes`.
  //
  // Returns the combined HTML string
  appendQuotedHTML(htmlWithoutQuotes, originalHTML) {
    const originalDoc = this._parseHTML(originalHTML);
    const quoteElements = this._findQuoteLikeElements(originalDoc);
    const doc = this._parseHTML(htmlWithoutQuotes);
    for (const node of quoteElements) {
      doc.body.appendChild(node);
    }
    return this._outputHTMLFor(doc, {initialHTML: originalHTML});
  }

  // Undoes `hideQuotedHTML`: strips the annotation from every element that
  // carries `annotationClass` and puts its saved inline display back.
  restoreAnnotatedHTML(html) {
    const doc = this._parseHTML(html);
    const quoteElements = this._findAnnotatedElements(doc);
    this._removeAnnotation(quoteElements);
    return this._outputHTMLFor(doc, {initialHTML: html});
  }

  // Parses `text` as "text/html". If the parser throws, the error is
  // reported through `NylasEnv.reportError` and a document containing the
  // error message is returned instead.
  _parseHTML(text) {
    const domParser = new DOMParser();
    let doc;
    try {
      doc = domParser.parseFromString(text, "text/html");
    } catch (error) {
      const errText = `HTML Parser Error: ${error.toString()}`;
      doc = domParser.parseFromString(errText, "text/html");
      NylasEnv.reportError(error);
    }

    // As far as we can tell, when this succeeds, doc /always/ has at least
    // one child: an <html> node.
    return doc;
  }

  // Serializes `doc`. When the input HTML had an explicit <head> or <body>
  // tag the contents of the root element are returned, otherwise only the
  // contents of the body.
  _outputHTMLFor(doc, {initialHTML}) {
    if (/<\s?head\s?>/i.test(initialHTML) || /<\s?body[\s>]/i.test(initialHTML)) {
      return doc.children[0].innerHTML;
    }
    return doc.body.innerHTML;
  }

  // Returns true when the body has exactly one child node with non-blank
  // text and that node is one of `quoteElements`.
  _wholeBodyIsQuote(doc, quoteElements) {
    const nonBlankChildren = Array.from(doc.body.childNodes).filter(
      (child) => child.textContent.trim() !== ""
    );
    if (nonBlankChildren.length !== 1) {
      return false;
    }
    return Array.from(quoteElements).includes(nonBlankChildren[0]);
  }

  // We used to have a scheme where we cached the `doc` object, keyed by
  // the md5 of the text. Unfortunately we can't do this because the
  // `doc` is mutated in place. Returning clones of the DOM is just as
  // bad as re-parsing from string, which is very fast anyway.

  // Runs every quote parser over `doc` and returns the unique, non-null
  // elements found. Unless `includeInline` is set, only trailing quotes are
  // kept.
  _findQuoteLikeElements(doc, {includeInline} = {}) {
    const parsers = [
      this._findGmailQuotes,
      this._findOffice365Quotes,
      this._findBlockquoteQuotes,
    ];

    let quoteElements = [];
    for (const parser of parsers) {
      // Keep `this` bound so a parser can rely on instance state.
      quoteElements = quoteElements.concat(parser.call(this, doc) || []);
    }

    if (!includeInline && quoteElements.length > 0) {
      // This means we only want to remove quoted text that shows up at the
      // end of a message. If there were non quoted content after, it'd be
      // inline.
      const trailingQuotes = this._findTrailingQuotes(doc, quoteElements);

      // Only keep the trailing quotes so we can delete them.
      quoteElements = _.intersection(quoteElements, trailingQuotes);
    }

    return _.compact(_.uniq(quoteElements));
  }

  // This will recursively move through the DOM, bottom to top, and pick
  // out quoted text blocks. It will stop when it reaches a visible
  // non-quote text region.
  _findTrailingQuotes(scopeElement, quoteElements = []) {
    // Build the membership lookup once instead of copying the list on every
    // loop iteration of every recursive call.
    const quoteSet = new Set(Array.from(quoteElements));

    const collect = (scope) => {
      const trailingQuotes = [];

      // We need to find only the child nodes that have content in them. We
      // determine if it's an inline quote based on if there's VISIBLE
      // content after a piece of quoted text
      const nodesWithContent = DOMUtils.nodesWithContent(scope);

      // There may be multiple quote blocks that are siblings of each
      // other at the end of the message. We want to include all of these
      // trailing quote elements.
      for (let i = nodesWithContent.length - 1; i >= 0; i--) {
        const nodeWithContent = nodesWithContent[i];
        if (!quoteSet.has(nodeWithContent)) {
          // First non-quote node from the bottom: look inside it for
          // trailing quotes, then stop scanning this level.
          return trailingQuotes.concat(collect(nodeWithContent));
        }
        // This is a valid quote. Let's keep it!
        //
        // This quote block may have many more quote blocks inside of it.
        // Luckily we don't need to explicitly find all of those because
        // once this block gets removed from the DOM, we'll delete all
        // sub-quotes as well.
        trailingQuotes.push(nodeWithContent);
      }

      return trailingQuotes;
    };

    return collect(scopeElement);
  }

  // Returns true if `node` is `quoteElement` or contains it.
  _contains(node, quoteElement) {
    return node === quoteElement || node.contains(quoteElement);
  }

  // Returns an array of all elements carrying `annotationClass`.
  _findAnnotatedElements(doc) {
    return Array.prototype.slice.call(doc.getElementsByClassName(this.annotationClass));
  }

  // Marks each element with `annotationClass`, hides it, and remembers its
  // inline display in a data attribute so `_removeAnnotation` can undo it.
  _annotateElements(elements = []) {
    for (const el of elements) {
      el.classList.add(this.annotationClass);
      // If the element was annotated before, the saved value is the real
      // original; overwriting it would store "none" and make the element
      // impossible to restore.
      if (!el.hasAttribute(ORIGINAL_DISPLAY_ATTR)) {
        el.setAttribute(ORIGINAL_DISPLAY_ATTR, el.style.display);
      }
      el.style.display = "none";
    }
  }

  // Reverses `_annotateElements` for each element.
  _removeAnnotation(elements = []) {
    for (const el of elements) {
      el.classList.remove(this.annotationClass);
      const originalDisplay = el.getAttribute(ORIGINAL_DISPLAY_ATTR);
      // `getAttribute` yields null when the attribute is absent; fall back
      // to an empty string so the inline display is simply cleared.
      el.style.display = originalDisplay || "";
      el.removeAttribute(ORIGINAL_DISPLAY_ATTR);
    }
  }

  _findGmailQuotes(doc) {
    // Gmail creates both div.gmail_quote and blockquote.gmail_quote. The div
    // version marks text but does not cause indentation, but both should be
    // considered quoted text.
    return Array.prototype.slice.call(doc.querySelectorAll('.gmail_quote'));
  }

  // Collects the Office 365 reply/forward markers. When a marker directly
  // follows an <hr>, its parent element is used as the quote instead.
  _findOffice365Quotes(doc) {
    const markers = Array.prototype.slice.call(
      doc.querySelectorAll('#divRplyFwdMsg, #OLK_SRC_BODY_SECTION')
    );

    // Also look up the marker under this literal id string.
    const weirdEl = doc.getElementById('3D"divRplyFwdMsg"');
    if (weirdEl) {
      markers.push(weirdEl);
    }

    return markers.map((el) => {
      if (el.previousElementSibling && el.previousElementSibling.nodeName === "HR") {
        return el.parentElement;
      }
      return el;
    });
  }

  // Returns every <blockquote> element in `doc` as an array.
  _findBlockquoteQuotes(doc) {
    return Array.prototype.slice.call(doc.querySelectorAll('blockquote'));
  }
}
|
||||
|
||||
// Export a single shared instance rather than the class itself.
export default new QuotedHTMLTransformer();
|
Loading…
Reference in a new issue