"""Tests for the html5lib HTML serializer, driven by JSON-style token lists."""
from __future__ import absolute_import, division, unicode_literals

import os
import json

import pytest

from .support import get_data_files

from html5lib import constants
from html5lib.filters.lint import Filter as Lint
from html5lib.serializer import HTMLSerializer, serialize
from html5lib.treewalkers.base import TreeWalker

# pylint:disable=wrong-import-position
# lxml is optional: tests that need it are skipped when it cannot be imported.
optionals_loaded = []
try:
    from lxml import etree
    optionals_loaded.append("lxml")
except ImportError:
    pass
# pylint:enable=wrong-import-position

# Namespace assumed for tokens that do not carry an explicit one.
default_namespace = constants.namespaces["html"]


class JsonWalker(TreeWalker):
    """Tree walker over the list-of-lists token format used by these tests.

    Each token is a list whose first item names the token type ("StartTag",
    "EndTag", "EmptyTag", "Comment", "Characters", "SpaceCharacters" or
    "Doctype"); the remaining items are that token's fields.
    """

    def __iter__(self):
        """Yield html5lib tree-walker tokens for every entry in ``self.tree``.

        Raises:
            ValueError: if an entry has an unrecognised token type.
        """
        for token in self.tree:
            token_type = token[0]
            if token_type == "StartTag":
                # The namespace is optional; a 4-item token carries it.
                if len(token) == 4:
                    namespace, name, attrib = token[1:4]
                else:
                    namespace = default_namespace
                    name, attrib = token[1:3]
                yield self.startTag(namespace, name, self._convertAttrib(attrib))
            elif token_type == "EndTag":
                if len(token) == 3:
                    namespace, name = token[1:3]
                else:
                    namespace = default_namespace
                    name = token[1]
                yield self.endTag(namespace, name)
            elif token_type == "EmptyTag":
                if len(token) == 4:
                    namespace, name, attrib = token[1:]
                else:
                    namespace = default_namespace
                    name, attrib = token[1:]
                # emptyTag() produces an iterable of tokens; forward each one.
                for empty_token in self.emptyTag(namespace, name, self._convertAttrib(attrib)):
                    yield empty_token
            elif token_type == "Comment":
                yield self.comment(token[1])
            elif token_type in ("Characters", "SpaceCharacters"):
                # text() produces an iterable of tokens; forward each one.
                for text_token in self.text(token[1]):
                    yield text_token
            elif token_type == "Doctype":
                # Name only, name + public id, or name + public id + system id.
                if len(token) == 4:
                    yield self.doctype(token[1], token[2], token[3])
                elif len(token) == 3:
                    yield self.doctype(token[1], token[2])
                else:
                    yield self.doctype(token[1])
            else:
                raise ValueError("Unknown token type: " + token_type)

    def _convertAttrib(self, attribs):
        """Convert test-data attributes to html5lib's tree-walker format.

        html5lib tree-walkers use a dict of (namespace, name): value for
        attributes, but JSON cannot represent this. Convert from the format
        in the serializer tests (a list of dicts with "namespace", "name",
        and "value" as keys) to html5lib's tree-walker format.
        """
        attrs = {}
        for attrib in attribs:
            name = (attrib["namespace"], attrib["name"])
            # Duplicate attributes in the test data indicate a broken fixture.
            assert name not in attrs
            attrs[name] = attrib["value"]
        return attrs


def serialize_html(input, options):
    """Serialize a JSON-style token list and return the rendered output.

    Args:
        input: list of tokens in the format understood by ``JsonWalker``.
        options: mapping of ``HTMLSerializer`` keyword options. An optional
            "encoding" entry is removed and passed to ``render`` instead.

    Returns:
        Whatever ``HTMLSerializer.render`` returns for the token stream.
    """
    # Copy with str() keys so the caller's dict is not mutated and the keys
    # are usable as keyword-argument names.
    options = {str(k): v for k, v in options.items()}
    encoding = options.pop("encoding", None)
    # NOTE(review): the meaning of the positional False passed to the lint
    # filter is not visible here - confirm against html5lib.filters.lint.
    stream = Lint(JsonWalker(input), False)
    serializer = HTMLSerializer(alphabetical_attributes=True, **options)
    return serializer.render(stream, encoding)


def runSerializerTest(input, expected, options):
    """Assert that serializing ``input`` gives one of the ``expected`` outputs.

    Args:
        input: list of tokens in the format understood by ``JsonWalker``.
        expected: list of acceptable outputs (text). When ``options`` names
            an encoding, each entry is encoded with it before comparison.
        options: serializer options, forwarded to ``serialize_html``.
    """
    encoding = options.get("encoding", None)

    if encoding:
        # Expected values must themselves be representable in the target
        # encoding, hence entity references rather than raw characters.
        expected = [x.encode(encoding) for x in expected]

    result = serialize_html(input, options)
    if len(expected) == 1:
        assert expected[0] == result, "Expected:\n%s\nActual:\n%s\nOptions:\n%s" % (expected[0], result, str(options))
    else:
        assert result in expected, "Expected: %s, Received: %s" % (expected, result)


def throwsWithLatin1(input):
    """Assert that serializing ``input`` as ISO-8859-1 raises UnicodeEncodeError."""
    with pytest.raises(UnicodeEncodeError):
        serialize_html(input, {"encoding": "iso-8859-1"})


def testDoctypeName():
    """A non-Latin-1 character in a doctype name cannot be encoded."""
    throwsWithLatin1([["Doctype", "\u0101"]])


def testDoctypePublicId():
    """A non-Latin-1 character in a doctype public id cannot be encoded."""
    throwsWithLatin1([["Doctype", "potato", "\u0101"]])


def testDoctypeSystemId():
    """A non-Latin-1 character in a doctype system id cannot be encoded."""
    throwsWithLatin1([["Doctype", "potato", "potato", "\u0101"]])


def testCdataCharacters():
    """Unencodable text inside <style> is emitted as a named entity."""
    runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "style", {}], ["Characters", "\u0101"]],
                      ["<style>&amacr;"], {"encoding": "iso-8859-1"})


def testCharacters():
    """Unencodable text is emitted as a named entity."""
    runSerializerTest([["Characters", "\u0101"]],
                      ["&amacr;"], {"encoding": "iso-8859-1"})


def testStartTagName():
    """A non-Latin-1 character in a start tag name cannot be encoded."""
    throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "\u0101", []]])


def testAttributeName():
    """A non-Latin-1 character in an attribute name cannot be encoded."""
    throwsWithLatin1([["StartTag", "http://www.w3.org/1999/xhtml", "span",
                       [{"namespace": None, "name": "\u0101", "value": "potato"}]]])


def testAttributeValue():
    """An unencodable attribute value is emitted as a named entity."""
    runSerializerTest([["StartTag", "http://www.w3.org/1999/xhtml", "span",
                        [{"namespace": None, "name": "potato", "value": "\u0101"}]]],
                      ["<span potato=&amacr;>"], {"encoding": "iso-8859-1"})


def testEndTagName():
    """A non-Latin-1 character in an end tag name cannot be encoded."""
    throwsWithLatin1([["EndTag", "http://www.w3.org/1999/xhtml", "\u0101"]])


def testComment():
    """A non-Latin-1 character in a comment cannot be encoded."""
    throwsWithLatin1([["Comment", "\u0101"]])


def testThrowsUnknownOption():
    """Unknown serializer keyword options are rejected with TypeError."""
    with pytest.raises(TypeError):
        HTMLSerializer(foobar=None)


@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"))
def testSpecQuoteAttribute(c):
    """Each listed character forces quoting with quote_attr_values="spec"."""
    input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
               [{"namespace": None, "name": "foo", "value": c}]]]
    # A value containing a double quote is wrapped in single quotes instead.
    if c == '"':
        output_ = ["<span foo='%s'>" % c]
    else:
        output_ = ['<span foo="%s">' % c]
    options_ = {"quote_attr_values": "spec"}
    runSerializerTest(input_, output_, options_)


@pytest.mark.parametrize("c", list("\t\n\u000C\x20\r\"'=<>`"
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000"))
def testLegacyQuoteAttribute(c):
    """Each listed character forces quoting with quote_attr_values="legacy"."""
    input_ = [["StartTag", "http://www.w3.org/1999/xhtml", "span",
               [{"namespace": None, "name": "foo", "value": c}]]]
    # A value containing a double quote is wrapped in single quotes instead.
    if c == '"':
        output_ = ["<span foo='%s'>" % c]
    else:
        output_ = ['<span foo="%s">' % c]
    options_ = {"quote_attr_values": "legacy"}
    runSerializerTest(input_, output_, options_)


@pytest.fixture
def lxml_parser():
    """Return an lxml XML parser created with resolve_entities=False."""
    return etree.XMLParser(resolve_entities=False)


@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityReplacement(lxml_parser):
    """A named entity in the tree is serialized as the character it names."""
    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
    result = serialize(tree, tree="lxml", omit_optional_tags=False)
    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>\u03B2</html>'


@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityXML(lxml_parser):
    """The predefined XML entity &gt; is still serialized as &gt;."""
    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'
    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
    result = serialize(tree, tree="lxml", omit_optional_tags=False)
    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&gt;</html>'


@pytest.mark.skipif("lxml" not in optionals_loaded, reason="lxml not importable")
def testEntityNoResolve(lxml_parser):
    """With resolve_entities=False the entity reference is kept verbatim."""
    doc = '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'
    tree = etree.fromstring(doc, parser=lxml_parser).getroottree()
    result = serialize(tree, tree="lxml", omit_optional_tags=False,
                       resolve_entities=False)
    assert result == '<!DOCTYPE html SYSTEM "about:legacy-compat"><html>&beta;</html>'


def test_serializer():
    """Yield one ``runSerializerTest`` case per entry in the JSON data files.

    Every ``*.test`` file found by ``get_data_files`` is parsed as JSON and
    each item of its "tests" list supplies "input", "expected" and optional
    "options".
    """
    # NOTE(review): yield-style tests are no longer run by pytest 4+; this
    # generator needs converting to parametrization if the suite moves to a
    # newer pytest.
    for filename in get_data_files('serializer-testdata', '*.test', os.path.dirname(__file__)):
        with open(filename) as fp:
            tests = json.load(fp)
            for test in tests['tests']:
                yield runSerializerTest, test["input"], test["expected"], test.get("options", {})