libervia-backend: src/plugins/plugin_misc_text

comparison src/plugins/plugin_misc_text_syntaxes.py @ 692:e98db42cd78c

plugin text syntaxes: styles sanitisation

author	Goffi <goffi@goffi.org>
date	Tue, 12 Nov 2013 14:14:36 +0100
parents	fb0b1100c908
children	ddd2781bdd8c

comparison

equal deleted inserted replaced

-:481e0f8ae47c
+:e98db42cd78c
 from wokkel import disco, pubsub
 from twisted.internet import defer
 from twisted.internet.threads import deferToThread
 from lxml import html
 from lxml.html import clean
+import re
 CATEGORY = "Composition" # parameter category (used as params_data 'category_name')
 NAME = "Syntax" # parameter name (used as params_data 'name')
 _SYNTAX_XHTML = "XHTML" # default value of the parameter, and default target syntax of convert()
+# CSS property names allowed to stay in a "style" attribute (clean_style drops every other property) -- TODO: check/adapt following list
+STYLES_WHITELIST = ["azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width"] # based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
+SAFE_ATTRS = html.defs.safe_attrs.union(('style',)) # lxml's safe attributes plus "style"; passed to clean.Cleaner(safe_attrs=...) in clean_xhtml
# Pattern describing the only CSS values accepted in a "style" attribute.
# A value must be exactly ONE of the alternatives below, optionally followed
# by "!important"; values are expected to be lower-cased and stripped by the
# caller before matching.
# The pieces are raw strings so that "\d" and "\(" reach the regex engine
# untouched, and the decimal point is escaped ("\.") so that only a real "."
# is accepted between the integer and fractional digits.
STYLES_VALUES_REGEX = r'^(' + '|'.join([
    r'([a-z-]+)',  # keywords: lowercase letters and hyphens only (e.g. "bold", "sans-serif")
    r'(#[0-9a-f]+)',  # hex value (e.g. "#ff0000")
    r'(\d+(\.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))',  # number, with a unit or not
    r'rgb\( *((\d+(\.\d+)?), *){2}(\d+(\.\d+)?) *\)',  # rgb function
    r'rgba\( *((\d+(\.\d+)?), *){3}(\d+(\.\d+)?) *\)',  # rgba function
]) + r') *(!important)?$'  # we accept "!important" at the end
# Compiled once at import time; clean_style calls .match() for every declaration
STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
 PLUGIN_INFO = {
 "name": "Text syntaxes",
 "import_name": "TEXT-SYNTAXES",
 "type": "MISC",
 </params>
 """
 params_data = {
 'category_name': CATEGORY,
-'category_label': _(CATEGORY), #FIXME: gof:  vérifier que gettext gère ça
+'category_label': _(CATEGORY), #FIXME: check that gettext can handle this
 'name': NAME,
 'label': _(NAME),
 'default': _SYNTAX_XHTML,
 'syntaxes': {},
 }
 except ImportError:
 warning("markdown or html2text not found, can't use Markdown syntax")
 host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s',
 async=True, method=self.convert)
-# TODO: gof: nettoyage XHTML avec lxml
 def _updateParamOptions(self):
 data_synt = TextSyntaxes.params_data['syntaxes']
 syntaxes = []
 for syntax in data_synt.keys():
 def clean_xhtml(self, xhtml):
     """ Clean XHTML text by removing potentially dangerous/malicious parts

     The cleaning itself is blocking, so it is run through deferToThread.
     @param xhtml: raw xhtml text to clean
     @return: the Deferred returned by deferToThread; its result is the
         cleaned markup serialised with html.tostring(..., method='xml')
     """
     def blocking_cleaning(xhtml):
         """ Clean XHTML and style attributes """
         def clean_style(styles_raw):
             """ Remove styles not in the whitelist,
             or where the value doesn't match the regex

             @param styles_raw: raw content of a "style" attribute
             @return: accepted declarations joined with "; "
                 (empty string if nothing was accepted)
             """
             cleaned_styles = []
             for style in styles_raw.split(";"):
                 try:
                     key, value = style.split(':')
                 except ValueError:
                     # not exactly one "key: value" pair
                     # (empty chunk, or more than one colon): drop it
                     continue
                 key = key.lower().strip()
                 if key not in STYLES_WHITELIST:
                     continue
                 value = value.lower().strip()
                 if not STYLES_ACCEPTED_VALUE.match(value):
                     continue
                 if value == "none":
                     # "none" matches the keyword pattern but is always refused
                     # NOTE(review): presumably to avoid hidden content
                     # (e.g. "display: none") -- confirm
                     continue
                 cleaned_styles.append("%s: %s" % (key, value))
             return "; ".join(cleaned_styles)

         xhtml_elt = html.fromstring(xhtml)
         # style=False keeps the "style" attributes (allowed through SAFE_ATTRS);
         # their content is sanitised below by clean_style
         cleaner = clean.Cleaner(style=False,
                                 add_nofollow=False,
                                 safe_attrs=SAFE_ATTRS)
         xhtml_elt = cleaner.clean_html(xhtml_elt)
         for elt in xhtml_elt.xpath("//*[@style]"):
             cleaned = clean_style(elt.get('style'))
             if cleaned:
                 elt.set("style", cleaned)
             else:
                 # every declaration was rejected: remove the attribute
                 # instead of leaving an empty style=""
                 del elt.attrib["style"]
         return html.tostring(xhtml_elt, method='xml')

     d = deferToThread(blocking_cleaning, xhtml)
     return d
 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True):

Mercurial > libervia-backend

comparison src/plugins/plugin_misc_text_syntaxes.py @ 692:e98db42cd78c