diff src/plugins/plugin_misc_text_syntaxes.py @ 705:6c8a119dcc94

plugin text syntaxes: clean_xhtml now accepts lxml's HtmlElement to avoid parsing the same XML twice
author Goffi <goffi@goffi.org>
date Thu, 14 Nov 2013 17:53:47 +0100
parents a25db3fe3959
children 312a2842b2b8
line wrap: on
line diff
--- a/src/plugins/plugin_misc_text_syntaxes.py	Thu Nov 14 17:51:35 2013 +0100
+++ b/src/plugins/plugin_misc_text_syntaxes.py	Thu Nov 14 17:53:47 2013 +0100
@@ -22,6 +22,7 @@
 from wokkel import disco, pubsub
 from twisted.internet import defer
 from twisted.internet.threads import deferToThread
+from sat.core import exceptions
 from lxml import html
 from lxml.html import clean
 import re
@@ -131,7 +132,7 @@
 
     def clean_xhtml(self, xhtml):
         """ Clean XHTML text by removing potentially dangerous/malicious parts
-        @param xhtml: raw xhtml text to clean
+        @param xhtml: raw xhtml text to clean (or lxml's HtmlElement)
         """
         def blocking_cleaning(xhtml):
             """ Clean XHTML and style attributes """
@@ -157,7 +158,13 @@
                     cleaned_styles.append((key, value))
                 return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles])
 
-            xhtml_elt = html.fromstring(xhtml)
+            if isinstance(xhtml, basestring):
+                xhtml_elt = html.fromstring(xhtml)
+            elif isinstance(xhtml, html.HtmlElement):
+                xhtml_elt = xhtml
+            else:
+                error("Only strings and HtmlElements can be cleaned")
+                raise exceptions.DataError
             cleaner = clean.Cleaner(style=False,
                                     add_nofollow=False,
                                     safe_attrs=SAFE_ATTRS)
@@ -166,8 +173,7 @@
                 elt.set("style", clean_style(elt.get('style')))
             return html.tostring(xhtml_elt, method='xml')
 
-        d = deferToThread(blocking_cleaning, xhtml)
-        return d
+        return deferToThread(blocking_cleaning, xhtml)
 
     def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True):
         """ Convert a text between two syntaxes