bazarr/libs/apprise/conversion.py

202 lines
6.3 KiB
Python

# -*- coding: utf-8 -*-
# BSD 3-Clause License
#
# Apprise - Push Notification Library.
# Copyright (c) 2023, Chris Caron <lead2gold@gmail.com>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import re
from markdown import markdown
from .common import NotifyFormat
from .URLBase import URLBase
from html.parser import HTMLParser
def convert_between(from_format, to_format, content):
    """
    Translate content from one supported notification format to another.

    The (from_format, to_format) pair is looked up in a table of known
    converters. When a converter is registered for the pair, its result is
    returned; otherwise the content is handed back untouched.
    """
    # Dispatch table keyed on (source format, target format)
    dispatch = {
        (NotifyFormat.MARKDOWN, NotifyFormat.HTML): markdown_to_html,
        (NotifyFormat.TEXT, NotifyFormat.HTML): text_to_html,
        (NotifyFormat.HTML, NotifyFormat.TEXT): html_to_text,
        # HTML -> Markdown currently reuses the plain-text converter
        (NotifyFormat.HTML, NotifyFormat.MARKDOWN): html_to_text,
    }

    handler = dispatch.get((from_format, to_format))
    if not handler:
        # No conversion known for this pair; pass the content through
        return content

    return handler(content)
def markdown_to_html(content):
    """
    Render the supplied Markdown content as HTML.

    The work is delegated entirely to the markdown() function imported at
    the top of this module.
    """
    rendered = markdown(content)
    return rendered
def text_to_html(content):
    """
    Render the supplied plain text content as HTML.

    Escaping is delegated to URLBase.escape_html(), which is asked to
    convert new lines as well (convert_new_lines=True).
    """
    escaped = URLBase.escape_html(content, convert_new_lines=True)
    return escaped
def html_to_text(content):
    """
    Render the supplied HTML content as plain text.

    A fresh HTMLConverter is fed the content and closed; the text it
    exposes through its ``converted`` attribute is returned.
    """
    converter = HTMLConverter()

    # Parse the document, then signal end-of-input so the result is built
    converter.feed(content)
    converter.close()

    return converter.converted
class HTMLConverter(HTMLParser, object):
    """An HTML to plain text converter tuned for email messages.

    Usage: call feed() with the HTML, then close(). Once close() has
    returned, the plain text is available in the ``converted`` attribute.
    """

    # The following tags must start on a new line
    BLOCK_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                  'div', 'td', 'th', 'code', 'pre', 'label', 'li',)

    # The following tags ignore any internal text
    IGNORE_TAGS = (
        'form', 'input', 'textarea', 'select', 'ul', 'ol', 'style', 'link',
        'meta', 'title', 'html', 'head', 'script')

    # Condense Whitespace
    WS_TRIM = re.compile(r'[\s]+', re.DOTALL | re.MULTILINE)

    # Sentinel value for block tag boundaries, which may be consolidated into
    # a single line break.
    BLOCK_END = {}

    def __init__(self, **kwargs):
        """
        Initialize the parser; keyword arguments are passed to HTMLParser.
        """
        super().__init__(**kwargs)

        # Should we store the text content or not?
        self._do_store = True

        # Initialize internal result list; holds text fragments (str)
        # interleaved with BLOCK_END sentinels
        self._result = []

        # Initialize public result field (not populated until close() is
        # called)
        self.converted = ""

    def close(self):
        """
        Finish parsing and populate the ``converted`` attribute.
        """
        # Let the base parser flush whatever it is still buffering. Without
        # this, trailing text that looks like an unfinished character
        # reference (e.g. the '&T' in 'AT&T') never reaches handle_data()
        # and is silently dropped from the output.
        super().close()

        string = ''.join(self._finalize(self._result))
        self.converted = string.strip()

    def _finalize(self, result):
        """
        Combines and strips consecutive strings, then converts consecutive
        block ends into singleton newlines.

        For example, the items
            [ {be}, " Hello ", {be}, {be}, " World!" ]
        yield "Hello" followed by a newline, and then "World!".
        """

        # None means the last visited item was a block end.
        accum = None

        for item in result:
            # The sentinel is compared by identity; it is the only
            # non-string object ever placed in the result list.
            if item is self.BLOCK_END:
                # Multiple consecutive block ends; do nothing.
                if accum is None:
                    continue

                # First block end; yield the current string, plus a newline.
                yield accum.strip() + '\n'
                accum = None

            # Multiple consecutive strings; combine them.
            elif accum is not None:
                accum += item

            # First consecutive string; store it.
            else:
                accum = item

        # Yield the last string if we have not already done so.
        if accum is not None:
            yield accum.strip()

    def handle_data(self, data, *args, **kwargs):
        """
        Store our data if it is not on the ignore list
        """

        if self._do_store:
            # Tidy our whitespace; every run collapses to a single space
            content = self.WS_TRIM.sub(' ', data)
            self._result.append(content)

    def handle_starttag(self, tag, attrs):
        """
        Process our starting HTML Tag
        """
        # Toggle initial states; text is only stored while the most recently
        # opened tag is not one of the ignored ones
        self._do_store = tag not in self.IGNORE_TAGS

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)

        if tag == 'li':
            self._result.append('- ')

        elif tag == 'br':
            self._result.append('\n')

        elif tag == 'hr':
            # Trim trailing spaces from the preceding text fragment. The
            # previous entry may instead be a BLOCK_END sentinel (e.g. for
            # '<p>x</p><hr>'), which has no rstrip() and must be left alone.
            if self._result and isinstance(self._result[-1], str):
                self._result[-1] = self._result[-1].rstrip(' ')

            self._result.append('\n---\n')

        elif tag == 'blockquote':
            self._result.append(' >')

    def handle_endtag(self, tag):
        """
        Edge case handling of open/close tags
        """
        # Any closing tag re-enables text storage
        self._do_store = True

        if tag in self.BLOCK_TAGS:
            self._result.append(self.BLOCK_END)