Mercurial > libervia-backend
diff src/plugins/plugin_misc_text_syntaxes.py @ 692:e98db42cd78c
plugin text syntaxes: styles sanitisation
author | Goffi <goffi@goffi.org> |
---|---|
date | Tue, 12 Nov 2013 14:14:36 +0100 |
parents | fb0b1100c908 |
children | ddd2781bdd8c |
line wrap: on
line diff
--- a/src/plugins/plugin_misc_text_syntaxes.py Sat Nov 09 09:12:38 2013 +0100 +++ b/src/plugins/plugin_misc_text_syntaxes.py Tue Nov 12 14:14:36 2013 +0100 @@ -24,12 +24,23 @@ from twisted.internet.threads import deferToThread from lxml import html from lxml.html import clean +import re CATEGORY = "Composition" NAME = "Syntax" _SYNTAX_XHTML = "XHTML" +# TODO: check/adapt following list +STYLES_WHITELIST = ["azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width"] # based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html) +SAFE_ATTRS = html.defs.safe_attrs.union(('style',)) +STYLES_VALUES_REGEX = r'^(' + '|'.join(['([a-z-]+)', # alphabetical names + '(#[0-9a-f]+)', # hex value + '(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))', # values with units (or not) + 'rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)', # rgb function + 'rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)', # rgba function + ]) + ') *(!important)?$' # we accept "!important" at the end +STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX) PLUGIN_INFO = { "name": "Text syntaxes", @@ -71,7 +82,7 @@ params_data = { 'category_name': CATEGORY, - 'category_label': _(CATEGORY), #FIXME: gof: vérifier que gettext gère ça + 'category_label': _(CATEGORY), #FIXME: check that gettext can handle this 'name': NAME, 'label': _(NAME), 'default': _SYNTAX_XHTML, @@ -92,8 +103,6 @@ host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s', async=True, method=self.convert) - # TODO: gof: nettoyage XHTML avec lxml - def _updateParamOptions(self): data_synt = TextSyntaxes.params_data['syntaxes'] syntaxes = [] @@ -124,14 +133,38 @@ """ Clean XHTML text by removing potentially dangerous/malicious parts @param xhtml: raw xhtml text to clean """ - # FIXME: styles are allowed but not cleaned, they have to be cleaned (whitelist ? cssutils ?) ! def blocking_cleaning(xhtml): - safe_attrs = html.defs.safe_attrs.union(('style',)) + """ Clean XHTML and style attributes """ + + def clean_style(styles_raw): + """" Remove styles not in the whitelist, + or where the value doesn't match the regex """ + styles = styles_raw.split(";") + cleaned_styles = [] + for style in styles: + try: + key, value = style.split(':') + except ValueError: + continue + key = key.lower().strip() + if key not in STYLES_WHITELIST: + continue + value = value.lower().strip() + if not STYLES_ACCEPTED_VALUE.match(value): + continue + if value == "none": + continue + cleaned_styles.append((key, value)) + return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles]) + xhtml_elt = html.fromstring(xhtml) cleaner = clean.Cleaner(style=False, add_nofollow=False, - safe_attrs=safe_attrs) - return html.tostring(cleaner.clean_html(xhtml_elt), method='xml') + safe_attrs=SAFE_ATTRS) + xhtml_elt = cleaner.clean_html(xhtml_elt) + for elt in xhtml_elt.xpath("//*[@style]"): + elt.set("style", clean_style(elt.get('style'))) + return html.tostring(xhtml_elt, method='xml') d = deferToThread(blocking_cleaning, xhtml) return d