changeset 2786:be8405795e09

plugin text syntaxes: handle empty content in cleanXHTML + don't use self-closing tags for non-void elements.
author Goffi <goffi@goffi.org>
date Sat, 19 Jan 2019 11:39:02 +0100
parents f18d8315929e
children 298408833ec2
files sat/plugins/plugin_misc_text_syntaxes.py
diffstat 1 files changed, 34 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/sat/plugins/plugin_misc_text_syntaxes.py	Sat Jan 19 11:39:02 2019 +0100
+++ b/sat/plugins/plugin_misc_text_syntaxes.py	Sat Jan 19 11:39:02 2019 +0100
@@ -31,6 +31,7 @@
 try:
     from lxml import html
     from lxml.html import clean
+    from lxml import etree
 except ImportError:
     raise exceptions.MissingModule(
         u"Missing module lxml, please download/install it from http://lxml.de/"
@@ -95,6 +96,25 @@
     "width",
 )
 
+# cf. https://www.w3.org/TR/html/syntax.html#void-elements
+VOID_ELEMENTS = (
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "keygen",
+    "link",
+    "menuitem",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr")
+
 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
 STYLES_VALUES_REGEX = (
     r"^("
@@ -279,7 +299,13 @@
         """
 
         if isinstance(xhtml, basestring):
-            xhtml_elt = html.fromstring(xhtml)
+            try:
+                xhtml_elt = html.fromstring(xhtml)
+            except etree.ParserError as e:
+                if not xhtml.strip():
+                    return u""
+                log.error(u"Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
+                raise e
         elif isinstance(xhtml, html.HtmlElement):
             xhtml_elt = xhtml
         else:
@@ -291,6 +317,13 @@
         xhtml_elt = cleaner.clean_html(xhtml_elt)
         for elt in xhtml_elt.xpath("//*[@style]"):
             elt.set("style", self.cleanStyle(elt.get("style")))
+        # we avoid self-closing tags for empty non-void elements
+        for element in xhtml_elt.iter(tag=etree.Element):
+            if not element.text:
+                if element.tag in VOID_ELEMENTS:
+                    element.text = None
+                else:
+                    element.text = u''
         return html.tostring(xhtml_elt, encoding=unicode, method="xml")
 
     def convert(