# bazarr/libs/markdown/treeprocessors.py

"""
Python Markdown
A Python implementation of John Gruber's Markdown.
Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/
Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).
Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)
License: BSD (see LICENSE.md for details).
"""

import re
import xml.etree.ElementTree as etree
from . import util
from . import inlinepatterns
def build_treeprocessors(md, **kwargs):
    """Build the default treeprocessors for Markdown.

    Arguments:
    * md: the Markdown instance passed to each treeprocessor's constructor.
    * kwargs: accepted for call compatibility; not used by this function.

    Returns: a `util.Registry` with three treeprocessors registered under
    the names 'inline', 'prettify' and 'unescape'.
    """
    treeprocessors = util.Registry()
    # The trailing number is presumably the registry priority (higher runs
    # first) -- confirm against `util.Registry.register`.
    treeprocessors.register(InlineProcessor(md), 'inline', 20)
    treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10)
    treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0)
    return treeprocessors
def isString(s):
    """Return True for a plain `str` that is not a `util.AtomicString`.

    `util.AtomicString` instances and every non-string object give False.
    """
    return not isinstance(s, util.AtomicString) and isinstance(s, str)
class Treeprocessor(util.Processor):
    """
    Base class for processors that run on the ElementTree object before
    serialization.

    A concrete Treeprocessor provides a `run` method which receives an
    ElementTree, changes it as required and hands back an ElementTree
    object.

    Treeprocessors must extend markdown.Treeprocessor.
    """

    def run(self, root):
        """
        Hook to be implemented by subclasses.

        `root` is the root ElementTree. An implementation may return a new
        ElementTree object, which then replaces the existing root, or it may
        modify the tree in place and return None.

        This base implementation does nothing and returns None.
        """
        return None  # pragma: no cover
class InlineProcessor(Treeprocessor):
    """
    A Treeprocessor that traverses a tree, applying inline patterns.

    Matched inline content is turned into nodes that are kept in
    `self.stashed_nodes` and represented in the text by placeholder strings;
    the placeholders are later swapped for the stashed nodes.
    """

    def __init__(self, md):
        """Store the Markdown instance and the placeholder settings.

        Arguments:
        * md: the Markdown instance; its `inlinePatterns` are the patterns
          applied by this processor.
        """
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
            + len(self.__placeholder_suffix)
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        # Lower-cased tag names of the elements enclosing the text that is
        # currently being processed.
        self.ancestors = []

    def __makePlaceholder(self, type):
        """Generate a placeholder.

        The id is the current number of stashed nodes, zero padded to four
        digits. `type` is accepted but not used.

        Returns: tuple of (placeholder string, id).
        """
        placeholder_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % placeholder_id
        return placeholder, placeholder_id

    def __findPlaceholder(self, data, index):
        """
        Extract id from data string, start from index.

        Keyword arguments:
        * data: string
        * index: index, from which we start search

        Returns: placeholder id and string index, after the found
        placeholder. If no placeholder is found, returns `None` and
        `index + 1`.
        """
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        return None, index + 1

    def __stashNode(self, node, type):
        """Add node to the stash and return the placeholder standing for it."""
        placeholder, placeholder_id = self.__makePlaceholder(type)
        self.stashed_nodes[placeholder_id] = node
        return placeholder

    def __handleInline(self, data, patternIndex=0):
        """
        Process string with inline patterns and replace it
        with placeholders.

        Keyword arguments:
        * data: A line of Markdown text
        * patternIndex: The index of the inlinePattern to start with

        Returns: String with placeholders. An `util.AtomicString` is
        returned untouched.
        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # Keep applying the same pattern until it stops matching,
                # only then move on to the next one.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node, subnode, isText=True):
        """
        Process placeholders in Element.text or Element.tail
        of Elements popped from self.stashed_nodes.

        Keywords arguments:
        * node: parent node
        * subnode: processing node
        * isText: bool variable, True - it's text, False - it's tail

        Returns: None
        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements found in a child's tail go right after that child;
        # everything else goes to the front of `node`.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting in reverse at a fixed position keeps the original order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(self, data, parent, isText=True):
        """
        Process string with placeholders and generate ElementTree tree.

        Keyword arguments:
        * data: string with placeholders instead of ElementTree elements.
        * parent: Element, which contains processing inline data
        * isText: True if `data` came from `parent.text`, False if it came
          from `parent.tail`; decides where leading plain text is attached.

        Returns: list of `(element, ancestors)` tuples for the ElementTree
        elements with applied inline patterns.
        """
        def linkText(text):
            # Attach plain text to the tail of the last collected element,
            # or, if there is none yet, to the parent's tail/text.
            # Plain assignment is used when the target is empty so that an
            # AtomicString is stored as-is rather than turned into a str.
            if text:
                if result:
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text

        result = []
        start_index = 0
        while data:
            index = data.find(self.__placeholder_prefix, start_index)
            if index != -1:
                node_id, phEndIndex = self.__findPlaceholder(data, index)

                if node_id in self.stashed_nodes:
                    node = self.stashed_nodes.get(node_id)

                    if index > 0:
                        text = data[start_index:index]
                        linkText(text)

                    if not isString(node):  # it's Element
                        for child in [node] + list(node):
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(
                                        node, child, False
                                    )
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else:  # it's just a string
                        linkText(node)
                        start_index = phEndIndex
                        continue

                    start_index = phEndIndex
                    result.append((node, self.ancestors[:]))

                else:  # wrong placeholder
                    # Emit the prefix as literal text and resume right
                    # after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[start_index:end])
                    start_index = end
            else:
                text = data[start_index:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the AtomicString
                    text = util.AtomicString(text)
                linkText(text)
                data = ""

        return result

    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to stashed_nodes.

        Keyword arguments:
        * data: the text to be processed
        * pattern: the pattern to be checked
        * patternIndex: index of current pattern
        * startIndex: string index, from which we start searching

        Returns: tuple of (string with placeholders instead of ElementTree
        elements, whether the pattern matched, index to continue from).
        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        for exclude in pattern.ANCESTOR_EXCLUDES:
            if exclude.lower() in self.ancestors:
                return data, False, 0

        if new_style:
            match = None
            # Since handleMatch may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # Rejected: forget this match and try the next one.
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            return data, True, end

        if not isString(node) and not isinstance(node.text, util.AtomicString):
            # We need to process current node too
            for child in [node] + list(node):
                if child.text:
                    self.ancestors.append(child.tag.lower())
                    child.text = self.__handleInline(
                        child.text, patternIndex + 1
                    )
                    self.ancestors.pop()
                if child.tail:
                    child.tail = self.__handleInline(
                        child.tail, patternIndex
                    )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent, parents):
        """Build the ancestor list.

        Walks from `parent` up through `self.parent_map` and appends the
        lower-cased tag names, outermost first, to the `parents` list
        (which is modified in place).
        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree, ancestors=None):
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over ElementTree, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree.  If you don't
        want to process your data with inline patterns, instead of normal
        string, use subclass AtomicString:

            node.text = markdown.AtomicString("This will not be processed.")

        Arguments:

        * tree: ElementTree object, representing Markdown tree.
        * ancestors: List of parent tag names that precede the tree node (if needed).

        Returns: ElementTree object with applied inline patterns.
        """
        self.stashed_nodes = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            # Elements produced from a child's text are inserted only after
            # the loop so that `currElement` iteration is not disturbed.
            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throw-away element that only collects the plain-text
                    # part of the processed tail.
                    dummy = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dummy, False)
                    if dummy.tail:
                        child.tail = dummy.tail
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree
class PrettifyTreeprocessor(Treeprocessor):
    """ Add linebreaks to the html document. """

    def _prettifyETree(self, elem):
        """ Recursively add linebreaks to ElementTree children.

        `elem.text` is set to a newline when `elem` is a block-level element
        other than `code`/`pre`, has no non-whitespace text and its first
        child is block level. `elem.tail` is set to a newline whenever it is
        empty or whitespace only.
        """
        i = "\n"
        if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']:
            if (not elem.text or not elem.text.strip()) \
                    and len(elem) and self.md.is_block_level(elem[0].tag):
                elem.text = i
            for e in elem:
                if self.md.is_block_level(e.tag):
                    self._prettifyETree(e)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i

    def run(self, root):
        """ Add linebreaks to ElementTree root object.

        Modifies the tree in place and returns None.
        """
        self._prettifyETree(root)
        # Do <br />'s separately as they are often in the middle of
        # inline content and missed by _prettifyETree.
        brs = root.iter('br')
        for br in brs:
            if not br.tail or not br.tail.strip():
                br.tail = '\n'
            else:
                br.tail = '\n%s' % br.tail
        # Clean up extra empty lines at end of code blocks.
        pres = root.iter('pre')
        for pre in pres:
            if len(pre) and pre[0].tag == 'code':
                code = pre[0]
                # Only prettify code containing text only
                if not len(code) and code.text is not None:
                    code.text = util.AtomicString(code.text.rstrip() + '\n')
class UnescapeTreeprocessor(Treeprocessor):
    """ Restore escaped chars """

    # A run of digits wrapped between util.STX and util.ETX.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m):
        """Turn the digits captured by `RE` into the character they encode."""
        code_point = int(m.group(1))
        return chr(code_point)

    def unescape(self, text):
        """Return `text` with every `RE` match replaced by its character."""
        return self.RE.sub(self._unescape, text)

    def run(self, root):
        """ Walk every element and unescape its text, tail and attributes. """
        for node in root.iter():
            # Text content; text of `code` elements is left untouched.
            if node.text and node.tag != 'code':
                node.text = self.unescape(node.text)
            # Tail content
            if node.tail:
                node.tail = self.unescape(node.tail)
            # Attribute values
            for name, raw in node.items():
                node.set(name, self.unescape(raw))