comparison sat/plugins/plugin_misc_text_syntaxes.py @ 2781:816be0a23877

plugin text syntaxes: cleanStyle is an independent method, cleanXHTML is now blocking (no need to launch thread for that)
author Goffi <goffi@goffi.org>
date Sat, 19 Jan 2019 11:39:02 +0100
parents 85d3240a400f
children b17e6fa1e607
comparison
equal deleted inserted replaced
2780:85d3240a400f 2781:816be0a23877
238 log.error( 238 log.error(
239 u"Error while {action}: {failure}".format(action=action, failure=failure) 239 u"Error while {action}: {failure}".format(action=action, failure=failure)
240 ) 240 )
241 return failure 241 return failure
242 242
243 def cleanStyle(self, styles):
244 """Clean unsafe CSS styles
245
246 Remove styles not in the whitelist, or where the value doesn't match the regex
247 @param styles(unicode): CSS styles
248 @return (unicode): cleaned styles
249 """
250 styles = styles.split(";")
251 cleaned_styles = []
252 for style in styles:
253 try:
254 key, value = style.split(":")
255 except ValueError:
256 continue
257 key = key.lower().strip()
258 if key not in STYLES_WHITELIST:
259 continue
260 value = value.lower().strip()
261 if not STYLES_ACCEPTED_VALUE.match(value):
262 continue
263 if value == "none":
264 continue
265 cleaned_styles.append((key, value))
266 return "; ".join(
267 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
268 )
269
243 def cleanXHTML(self, xhtml): 270 def cleanXHTML(self, xhtml):
244 """ Clean XHTML text by removing potentially dangerous/malicious parts 271 """Clean XHTML text by removing potentially dangerous/malicious parts
245 @param xhtml: raw xhtml text to clean (or lxml's HtmlElement) 272
246 """ 273 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean
247 274 @return (unicode): cleaned XHTML
248 def blocking_cleaning(xhtml): 275 """
249 """ Clean XHTML and style attributes """ 276
250 277 if isinstance(xhtml, basestring):
251 def clean_style(styles_raw): 278 xhtml_elt = html.fromstring(xhtml)
252 """" Remove styles not in the whitelist, 279 elif isinstance(xhtml, html.HtmlElement):
253 or where the value doesn't match the regex """ 280 xhtml_elt = xhtml
254 styles = styles_raw.split(";") 281 else:
255 cleaned_styles = [] 282 log.error("Only strings and HtmlElements can be cleaned")
256 for style in styles: 283 raise exceptions.DataError
257 try: 284 cleaner = clean.Cleaner(
258 key, value = style.split(":") 285 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
259 except ValueError: 286 )
260 continue 287 xhtml_elt = cleaner.clean_html(xhtml_elt)
261 key = key.lower().strip() 288 for elt in xhtml_elt.xpath("//*[@style]"):
262 if key not in STYLES_WHITELIST: 289 elt.set("style", self.cleanStyle(elt.get("style")))
263 continue 290 return html.tostring(xhtml_elt, encoding=unicode, method="xml")
264 value = value.lower().strip()
265 if not STYLES_ACCEPTED_VALUE.match(value):
266 continue
267 if value == "none":
268 continue
269 cleaned_styles.append((key, value))
270 return "; ".join(
271 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
272 )
273
274 if isinstance(xhtml, basestring):
275 xhtml_elt = html.fromstring(xhtml)
276 elif isinstance(xhtml, html.HtmlElement):
277 xhtml_elt = xhtml
278 else:
279 log.error("Only strings and HtmlElements can be cleaned")
280 raise exceptions.DataError
281 cleaner = clean.Cleaner(
282 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
283 )
284 xhtml_elt = cleaner.clean_html(xhtml_elt)
285 for elt in xhtml_elt.xpath("//*[@style]"):
286 elt.set("style", clean_style(elt.get("style")))
287 return html.tostring(xhtml_elt, encoding=unicode, method="xml")
288
289 d = deferToThread(blocking_cleaning, xhtml)
290 d.addErrback(self._logError, action=u"cleaning syntax")
291 return d
292 291
293 def convert( 292 def convert(
294 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None 293 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None
295 ): 294 ):
296 """Convert a text between two syntaxes 295 """Convert a text between two syntaxes
297 296
298 @param text: text to convert 297 @param text: text to convert
299 @param syntax_from: source syntax (e.g. "markdown") 298 @param syntax_from: source syntax (e.g. "markdown")
300 @param syntax_to: dest syntax (e.g.: "XHTML") 299 @param syntax_to: dest syntax (e.g.: "XHTML")
301 @param safe: clean resulting XHTML to avoid malicious code if True 300 @param safe: clean resulting XHTML to avoid malicious code if True
302 @param profile: needed only when syntax_from or syntax_to is set to _SYNTAX_CURRENT 301 @param profile: needed only when syntax_from or syntax_to is set to
302 _SYNTAX_CURRENT
303 @return(unicode): converted text 303 @return(unicode): converted text
304 """ 304 """
305 # FIXME: convert should be able to handle domish.Element directly 305 # FIXME: convert should be able to handle domish.Element directly
306 # when dealing with XHTML 306 # when dealing with XHTML
307 # TODO: a way for parser to return parsing errors/warnings 307 # TODO: a way for parser to return parsing errors/warnings