Mercurial > libervia-backend
changeset 2786:be8405795e09
plugin text syntaxes: handle empty content in cleanXHTML + don't use self-closing tags for non-void elements.
author | Goffi <goffi@goffi.org> |
---|---|
date | Sat, 19 Jan 2019 11:39:02 +0100 (2019-01-19) |
parents | f18d8315929e |
children | 298408833ec2 |
files | sat/plugins/plugin_misc_text_syntaxes.py |
diffstat | 1 files changed, 34 insertions(+), 1 deletions(-) [+] |
line wrap: on
line diff
--- a/sat/plugins/plugin_misc_text_syntaxes.py Sat Jan 19 11:39:02 2019 +0100
+++ b/sat/plugins/plugin_misc_text_syntaxes.py Sat Jan 19 11:39:02 2019 +0100
@@ -31,6 +31,7 @@
 try:
     from lxml import html
     from lxml.html import clean
+    from lxml import etree
 except ImportError:
     raise exceptions.MissingModule(
         u"Missing module lxml, please download/install it from http://lxml.de/"
@@ -95,6 +96,25 @@
     "width",
 )
 
+# cf. https://www.w3.org/TR/html/syntax.html#void-elements
+VOID_ELEMENTS = (
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "keygen",
+    "link",
+    "menuitem",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr")
+
 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
 STYLES_VALUES_REGEX = (
     r"^("
@@ -279,7 +299,13 @@
         """
 
         if isinstance(xhtml, basestring):
-            xhtml_elt = html.fromstring(xhtml)
+            try:
+                xhtml_elt = html.fromstring(xhtml)
+            except etree.ParserError as e:
+                if not xhtml.strip():
+                    return u""
+                log.error(u"Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
+                raise e
         elif isinstance(xhtml, html.HtmlElement):
             xhtml_elt = xhtml
         else:
@@ -291,6 +317,13 @@
         xhtml_elt = cleaner.clean_html(xhtml_elt)
         for elt in xhtml_elt.xpath("//*[@style]"):
             elt.set("style", self.cleanStyle(elt.get("style")))
+        # we remove self-closing elements for non-void elements
+        for element in xhtml_elt.iter(tag=etree.Element):
+            if not element.text:
+                if element.tag in VOID_ELEMENTS:
+                    element.text = None
+                else:
+                    element.text = u''
         return html.tostring(xhtml_elt, encoding=unicode, method="xml")
 
     def convert(