# HG changeset patch # User Goffi # Date 1384448027 -3600 # Node ID 6c8a119dcc9418de3e902887a3dffd3602673c6d # Parent 3c304929af74fbc839d47215e2929b591b191b54 plugin text syntaxes: clean_xhtml now accepts lxml's HtmlElement to avoid parsing the same XML twice diff -r 3c304929af74 -r 6c8a119dcc94 src/plugins/plugin_misc_text_syntaxes.py --- a/src/plugins/plugin_misc_text_syntaxes.py Thu Nov 14 17:51:35 2013 +0100 +++ b/src/plugins/plugin_misc_text_syntaxes.py Thu Nov 14 17:53:47 2013 +0100 @@ -22,6 +22,7 @@ from wokkel import disco, pubsub from twisted.internet import defer from twisted.internet.threads import deferToThread +from sat.core import exceptions from lxml import html from lxml.html import clean import re @@ -131,7 +132,7 @@ def clean_xhtml(self, xhtml): """ Clean XHTML text by removing potentially dangerous/malicious parts - @param xhtml: raw xhtml text to clean + @param xhtml: raw xhtml text to clean (or lxml's HtmlElement) """ def blocking_cleaning(xhtml): """ Clean XHTML and style attributes """ @@ -157,7 +158,13 @@ cleaned_styles.append((key, value)) return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles]) - xhtml_elt = html.fromstring(xhtml) + if isinstance(xhtml, basestring): + xhtml_elt = html.fromstring(xhtml) + elif isinstance(xhtml, html.HtmlElement): + xhtml_elt = xhtml + else: + error("Only strings and HtmlElements can be cleaned") + raise exceptions.DataError cleaner = clean.Cleaner(style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS) @@ -166,8 +173,7 @@ elt.set("style", clean_style(elt.get('style'))) return html.tostring(xhtml_elt, method='xml') - d = deferToThread(blocking_cleaning, xhtml) - return d + return deferToThread(blocking_cleaning, xhtml) def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True): """ Convert a text between two syntaxes