diff src/plugins/plugin_misc_text_syntaxes.py @ 692:e98db42cd78c

plugin text syntaxes: styles sanitisation
author Goffi <goffi@goffi.org>
date Tue, 12 Nov 2013 14:14:36 +0100
parents fb0b1100c908
children ddd2781bdd8c
line wrap: on
line diff
--- a/src/plugins/plugin_misc_text_syntaxes.py	Sat Nov 09 09:12:38 2013 +0100
+++ b/src/plugins/plugin_misc_text_syntaxes.py	Tue Nov 12 14:14:36 2013 +0100
@@ -24,12 +24,23 @@
 from twisted.internet.threads import deferToThread
 from lxml import html
 from lxml.html import clean
+import re
 
 
 CATEGORY = "Composition"
 NAME = "Syntax"
 _SYNTAX_XHTML = "XHTML"
+# CSS properties allowed in "style" attributes. TODO: check/adapt the following list
+STYLES_WHITELIST = ["azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width"] # based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
 
+SAFE_ATTRS = html.defs.safe_attrs.union(('style',)) # lxml's default safe attributes, plus "style"
+STYLES_VALUES_REGEX = r'^(' + '|'.join([r'([a-z-]+)', # alphabetical names
+                                       r'(#[0-9a-f]+)', # hex value
+                                       r'(\d+(\.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))', # values with units (or not); the decimal dot is escaped so only "." is accepted
+                                       r'rgb\( *((\d+(\.\d+)?), *){2}(\d+(\.\d+)?) *\)', # rgb function
+                                       r'rgba\( *((\d+(\.\d+)?), *){3}(\d+(\.\d+)?) *\)', # rgba function
+                                      ]) + r') *(!important)?$' # we accept "!important" at the end
+STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX) # compiled once at import; matched against lowercased, stripped values
 
 PLUGIN_INFO = {
     "name": "Text syntaxes",
@@ -71,7 +82,7 @@
 
     params_data = {
         'category_name': CATEGORY,
-        'category_label': _(CATEGORY), #FIXME: gof:  vérifier que gettext gère ça
+        'category_label': _(CATEGORY), #FIXME: check that gettext can handle this
         'name': NAME,
         'label': _(NAME),
         'default': _SYNTAX_XHTML,
@@ -92,8 +103,6 @@
         host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s',
                               async=True, method=self.convert)
 
-        # TODO: gof: nettoyage XHTML avec lxml
-
     def _updateParamOptions(self):
         data_synt = TextSyntaxes.params_data['syntaxes']
         syntaxes = []
@@ -124,14 +133,38 @@
         """ Clean XHTML text by removing potentially dangerous/malicious parts
         @param xhtml: raw xhtml text to clean
         """
-        # FIXME: styles are allowed but not cleaned, they have to be cleaned (whitelist ? cssutils ?) !
         def blocking_cleaning(xhtml):
-            safe_attrs = html.defs.safe_attrs.union(('style',))
+            """ Clean XHTML with lxml's Cleaner, then sanitise the remaining style attributes """
+
+            def clean_style(styles_raw):
+                """ Remove styles not in the whitelist, or where the value doesn't
+                match the regex or is "none"; return the kept ones as "key: value; ..." """
+                styles = styles_raw.split(";")  # one "property: value" declaration per item
+                cleaned_styles = []
+                for style in styles:
+                    try:
+                        key, value = style.split(':')
+                    except ValueError:  # not exactly one ":" (empty or malformed declaration)
+                        continue
+                    key = key.lower().strip()
+                    if key not in STYLES_WHITELIST:
+                        continue
+                    value = value.lower().strip()
+                    if not STYLES_ACCEPTED_VALUE.match(value):
+                        continue
+                    if value == "none":  # "none" passes the regex but is rejected anyway
+                        continue
+                    cleaned_styles.append((key, value))
+                return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles])
+
             xhtml_elt = html.fromstring(xhtml)
             cleaner = clean.Cleaner(style=False,
                                     add_nofollow=False,
-                                    safe_attrs=safe_attrs)
-            return html.tostring(cleaner.clean_html(xhtml_elt), method='xml')
+                                    safe_attrs=SAFE_ATTRS)  # SAFE_ATTRS includes "style"
+            xhtml_elt = cleaner.clean_html(xhtml_elt)
+            for elt in xhtml_elt.xpath("//*[@style]"):  # every element carrying a style attribute
+                elt.set("style", clean_style(elt.get('style')))  # replace with the sanitised declarations
+            return html.tostring(xhtml_elt, method='xml')
 
         d = deferToThread(blocking_cleaning, xhtml)
         return d