Mercurial > libervia-backend
comparison sat/plugins/plugin_misc_text_syntaxes.py @ 2786:be8405795e09
plugin text syntaxes: handle empty content in cleanXHTML + don't use self-closing tags for non-void elements.
author | Goffi <goffi@goffi.org> |
---|---|
date | Sat, 19 Jan 2019 11:39:02 +0100 |
parents | b17e6fa1e607 |
children | 148d30147890 |
comparison
equal
deleted
inserted
replaced
2785:f18d8315929e | 2786:be8405795e09 |
---|---|
29 from sat.tools import xml_tools | 29 from sat.tools import xml_tools |
30 | 30 |
31 try: | 31 try: |
32 from lxml import html | 32 from lxml import html |
33 from lxml.html import clean | 33 from lxml.html import clean |
| | 34 from lxml import etree |
34 except ImportError: | 35 except ImportError: |
35 raise exceptions.MissingModule( | 36 raise exceptions.MissingModule( |
36 u"Missing module lxml, please download/install it from http://lxml.de/" | 37 u"Missing module lxml, please download/install it from http://lxml.de/" |
37 ) | 38 ) |
38 from cgi import escape | 39 from cgi import escape |
93 "volume", | 94 "volume", |
94 "white-space", | 95 "white-space", |
95 "width", | 96 "width", |
96 ) | 97 ) |
97 | 98 |
| | 99 # cf. https://www.w3.org/TR/html/syntax.html#void-elements |
| | 100 VOID_ELEMENTS = ( |
| | 101 "area", |
| | 102 "base", |
| | 103 "br", |
| | 104 "col", |
| | 105 "embed", |
| | 106 "hr", |
| | 107 "img", |
| | 108 "input", |
| | 109 "keygen", |
| | 110 "link", |
| | 111 "menuitem", |
| | 112 "meta", |
| | 113 "param", |
| | 114 "source", |
| | 115 "track", |
| | 116 "wbr") |
| | 117 |
98 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) | 118 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) |
99 STYLES_VALUES_REGEX = ( | 119 STYLES_VALUES_REGEX = ( |
100 r"^(" | 120 r"^(" |
101 + "|".join( | 121 + "|".join( |
102 [ | 122 [ |
277 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean | 297 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean |
278 @return (unicode): cleaned XHTML | 298 @return (unicode): cleaned XHTML |
279 """ | 299 """ |
280 | 300 |
281 if isinstance(xhtml, basestring): | 301 if isinstance(xhtml, basestring): |
282 xhtml_elt = html.fromstring(xhtml) | 302 try: |
| | 303 xhtml_elt = html.fromstring(xhtml) |
| | 304 except etree.ParserError as e: |
| | 305 if not xhtml.strip(): |
| | 306 return u"" |
| | 307 log.error(u"Can't clean XHTML: {xhtml}".format(xhtml=xhtml)) |
| | 308 raise e |
283 elif isinstance(xhtml, html.HtmlElement): | 309 elif isinstance(xhtml, html.HtmlElement): |
284 xhtml_elt = xhtml | 310 xhtml_elt = xhtml |
285 else: | 311 else: |
286 log.error("Only strings and HtmlElements can be cleaned") | 312 log.error("Only strings and HtmlElements can be cleaned") |
287 raise exceptions.DataError | 313 raise exceptions.DataError |
289 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS | 315 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS |
290 ) | 316 ) |
291 xhtml_elt = cleaner.clean_html(xhtml_elt) | 317 xhtml_elt = cleaner.clean_html(xhtml_elt) |
292 for elt in xhtml_elt.xpath("//*[@style]"): | 318 for elt in xhtml_elt.xpath("//*[@style]"): |
293 elt.set("style", self.cleanStyle(elt.get("style"))) | 319 elt.set("style", self.cleanStyle(elt.get("style"))) |
| | 320 # we remove self-closing elements for non-void elements |
| | 321 for element in xhtml_elt.iter(tag=etree.Element): |
| | 322 if not element.text: |
| | 323 if element.tag in VOID_ELEMENTS: |
| | 324 element.text = None |
| | 325 else: |
| | 326 element.text = u'' |
294 return html.tostring(xhtml_elt, encoding=unicode, method="xml") | 327 return html.tostring(xhtml_elt, encoding=unicode, method="xml") |
295 | 328 |
296 def convert( | 329 def convert( |
297 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None | 330 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None |
298 ): | 331 ): |