comparison src/plugins/plugin_misc_text_syntaxes.py @ 705:6c8a119dcc94

plugin text syntaxes: clean_xhtml now accepts lxml's HtmlElement to avoid parsing the same XML twice
author Goffi <goffi@goffi.org>
date Thu, 14 Nov 2013 17:53:47 +0100
parents a25db3fe3959
children 312a2842b2b8
comparison
equal deleted inserted replaced
704:3c304929af74 705:6c8a119dcc94
20 from logging import debug, info, error, warning 20 from logging import debug, info, error, warning
21 21
22 from wokkel import disco, pubsub 22 from wokkel import disco, pubsub
23 from twisted.internet import defer 23 from twisted.internet import defer
24 from twisted.internet.threads import deferToThread 24 from twisted.internet.threads import deferToThread
25 from sat.core import exceptions
25 from lxml import html 26 from lxml import html
26 from lxml.html import clean 27 from lxml.html import clean
27 import re 28 import re
28 29
29 30
129 """ 130 """
130 return self.host.memory.getParamA(NAME, CATEGORY , profile_key=profile) 131 return self.host.memory.getParamA(NAME, CATEGORY , profile_key=profile)
131 132
132 def clean_xhtml(self, xhtml): 133 def clean_xhtml(self, xhtml):
133 """ Clean XHTML text by removing potentially dangerous/malicious parts 134 """ Clean XHTML text by removing potentially dangerous/malicious parts
134 @param xhtml: raw xhtml text to clean 135 @param xhtml: raw xhtml text to clean (or lxml's HtmlElement)
135 """ 136 """
136 def blocking_cleaning(xhtml): 137 def blocking_cleaning(xhtml):
137 """ Clean XHTML and style attributes """ 138 """ Clean XHTML and style attributes """
138 139
139 def clean_style(styles_raw): 140 def clean_style(styles_raw):
155 if value == "none": 156 if value == "none":
156 continue 157 continue
157 cleaned_styles.append((key, value)) 158 cleaned_styles.append((key, value))
158 return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles]) 159 return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles])
159 160
160 xhtml_elt = html.fromstring(xhtml) 161 if isinstance(xhtml, basestring):
162 xhtml_elt = html.fromstring(xhtml)
163 elif isinstance(xhtml, html.HtmlElement):
164 xhtml_elt = xhtml
165 else:
166 error("Only strings and HtmlElements can be cleaned")
167 raise exceptions.DataError
161 cleaner = clean.Cleaner(style=False, 168 cleaner = clean.Cleaner(style=False,
162 add_nofollow=False, 169 add_nofollow=False,
163 safe_attrs=SAFE_ATTRS) 170 safe_attrs=SAFE_ATTRS)
164 xhtml_elt = cleaner.clean_html(xhtml_elt) 171 xhtml_elt = cleaner.clean_html(xhtml_elt)
165 for elt in xhtml_elt.xpath("//*[@style]"): 172 for elt in xhtml_elt.xpath("//*[@style]"):
166 elt.set("style", clean_style(elt.get('style'))) 173 elt.set("style", clean_style(elt.get('style')))
167 return html.tostring(xhtml_elt, method='xml') 174 return html.tostring(xhtml_elt, method='xml')
168 175
169 d = deferToThread(blocking_cleaning, xhtml) 176 return deferToThread(blocking_cleaning, xhtml)
170 return d
171 177
172 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True): 178 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True):
173 """ Convert a text between two syntaxes 179 """ Convert a text between two syntaxes
174 @param text: text to convert 180 @param text: text to convert
175 @param syntax_from: source syntax (e.g. "markdown") 181 @param syntax_from: source syntax (e.g. "markdown")