comparison sat/plugins/plugin_misc_text_syntaxes.py @ 2786:be8405795e09

plugin text syntaxes: handle empty content in cleanXHTML + don't use self-closing tags for non-void elements.
author Goffi <goffi@goffi.org>
date Sat, 19 Jan 2019 11:39:02 +0100
parents b17e6fa1e607
children 148d30147890
comparison
equal deleted inserted replaced
2785:f18d8315929e 2786:be8405795e09
29 from sat.tools import xml_tools 29 from sat.tools import xml_tools
30 30
31 try: 31 try:
32 from lxml import html 32 from lxml import html
33 from lxml.html import clean 33 from lxml.html import clean
34 from lxml import etree
34 except ImportError: 35 except ImportError:
35 raise exceptions.MissingModule( 36 raise exceptions.MissingModule(
36 u"Missing module lxml, please download/install it from http://lxml.de/" 37 u"Missing module lxml, please download/install it from http://lxml.de/"
37 ) 38 )
38 from cgi import escape 39 from cgi import escape
93 "volume", 94 "volume",
94 "white-space", 95 "white-space",
95 "width", 96 "width",
96 ) 97 )
97 98
99 # cf. https://www.w3.org/TR/html/syntax.html#void-elements
100 VOID_ELEMENTS = (
101 "area",
102 "base",
103 "br",
104 "col",
105 "embed",
106 "hr",
107 "img",
108 "input",
109 "keygen",
110 "link",
111 "menuitem",
112 "meta",
113 "param",
114 "source",
115 "track",
116 "wbr")
117
98 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) 118 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
99 STYLES_VALUES_REGEX = ( 119 STYLES_VALUES_REGEX = (
100 r"^(" 120 r"^("
101 + "|".join( 121 + "|".join(
102 [ 122 [
277 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean 297 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean
278 @return (unicode): cleaned XHTML 298 @return (unicode): cleaned XHTML
279 """ 299 """
280 300
281 if isinstance(xhtml, basestring): 301 if isinstance(xhtml, basestring):
282 xhtml_elt = html.fromstring(xhtml) 302 try:
303 xhtml_elt = html.fromstring(xhtml)
304 except etree.ParserError as e:
305 if not xhtml.strip():
306 return u""
307 log.error(u"Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
308 raise e
283 elif isinstance(xhtml, html.HtmlElement): 309 elif isinstance(xhtml, html.HtmlElement):
284 xhtml_elt = xhtml 310 xhtml_elt = xhtml
285 else: 311 else:
286 log.error("Only strings and HtmlElements can be cleaned") 312 log.error("Only strings and HtmlElements can be cleaned")
287 raise exceptions.DataError 313 raise exceptions.DataError
289 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS 315 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
290 ) 316 )
291 xhtml_elt = cleaner.clean_html(xhtml_elt) 317 xhtml_elt = cleaner.clean_html(xhtml_elt)
292 for elt in xhtml_elt.xpath("//*[@style]"): 318 for elt in xhtml_elt.xpath("//*[@style]"):
293 elt.set("style", self.cleanStyle(elt.get("style"))) 319 elt.set("style", self.cleanStyle(elt.get("style")))
320 # we avoid self-closing tags for empty non-void elements (an empty text forces an explicit closing tag)
321 for element in xhtml_elt.iter(tag=etree.Element):
322 if not element.text:
323 if element.tag in VOID_ELEMENTS:
324 element.text = None
325 else:
326 element.text = u''
294 return html.tostring(xhtml_elt, encoding=unicode, method="xml") 327 return html.tostring(xhtml_elt, encoding=unicode, method="xml")
295 328
296 def convert( 329 def convert(
297 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None 330 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None
298 ): 331 ):