# HG changeset patch # User Goffi # Date 1547894342 -3600 # Node ID be8405795e09d81d454b660e5021c29fa8424501 # Parent f18d8315929e73487d5c65a279b1e53556fe239c plugin text syntaxes: handle empty content in cleanXHTML + don't use self-closing tags for non-void elements. diff -r f18d8315929e -r be8405795e09 sat/plugins/plugin_misc_text_syntaxes.py --- a/sat/plugins/plugin_misc_text_syntaxes.py Sat Jan 19 11:39:02 2019 +0100 +++ b/sat/plugins/plugin_misc_text_syntaxes.py Sat Jan 19 11:39:02 2019 +0100 @@ -31,6 +31,7 @@ try: from lxml import html from lxml.html import clean + from lxml import etree except ImportError: raise exceptions.MissingModule( u"Missing module lxml, please download/install it from http://lxml.de/" @@ -95,6 +96,25 @@ "width", ) +# cf. https://www.w3.org/TR/html/syntax.html#void-elements +VOID_ELEMENTS = ( + "area", + "base", + "br", + "col", + "embed", + "hr", + "img", + "input", + "keygen", + "link", + "menuitem", + "meta", + "param", + "source", + "track", + "wbr") + SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) STYLES_VALUES_REGEX = ( r"^(" @@ -279,7 +299,13 @@ """ if isinstance(xhtml, basestring): - xhtml_elt = html.fromstring(xhtml) + try: + xhtml_elt = html.fromstring(xhtml) + except etree.ParserError as e: + if not xhtml.strip(): + return u"" + log.error(u"Can't clean XHTML: {xhtml}".format(xhtml=xhtml)) + raise e elif isinstance(xhtml, html.HtmlElement): xhtml_elt = xhtml else: @@ -291,6 +317,13 @@ xhtml_elt = cleaner.clean_html(xhtml_elt) for elt in xhtml_elt.xpath("//*[@style]"): elt.set("style", self.cleanStyle(elt.get("style"))) + # we remove self-closing elements for non-void elements + for element in xhtml_elt.iter(tag=etree.Element): + if not element.text: + if element.tag in VOID_ELEMENTS: + element.text = None + else: + element.text = u'' return html.tostring(xhtml_elt, encoding=unicode, method="xml") def convert(