comparison src/plugins/plugin_misc_text_syntaxes.py @ 692:e98db42cd78c

plugin text syntaxes: styles sanitisation
author Goffi <goffi@goffi.org>
date Tue, 12 Nov 2013 14:14:36 +0100
parents fb0b1100c908
children ddd2781bdd8c
comparison
equal deleted inserted replaced
691:481e0f8ae47c 692:e98db42cd78c
22 from wokkel import disco, pubsub 22 from wokkel import disco, pubsub
23 from twisted.internet import defer 23 from twisted.internet import defer
24 from twisted.internet.threads import deferToThread 24 from twisted.internet.threads import deferToThread
25 from lxml import html 25 from lxml import html
26 from lxml.html import clean 26 from lxml.html import clean
27 import re
27 28
28 29
29 CATEGORY = "Composition" 30 CATEGORY = "Composition"
30 NAME = "Syntax" 31 NAME = "Syntax"
31 _SYNTAX_XHTML = "XHTML" 32 _SYNTAX_XHTML = "XHTML"
32 33 # TODO: check/adapt following list
34 STYLES_WHITELIST = ["azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width"] # based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
35
36 SAFE_ATTRS = html.defs.safe_attrs.union(('style',))
37 STYLES_VALUES_REGEX = r'^(' + '|'.join(['([a-z-]+)', # alphabetical names
38 '(#[0-9a-f]+)', # hex value
39 '(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))', # values with units (or not)
40 'rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)', # rgb function
41 'rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)', # rgba function
42 ]) + ') *(!important)?$' # we accept "!important" at the end
43 STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
33 44
34 PLUGIN_INFO = { 45 PLUGIN_INFO = {
35 "name": "Text syntaxes", 46 "name": "Text syntaxes",
36 "import_name": "TEXT-SYNTAXES", 47 "import_name": "TEXT-SYNTAXES",
37 "type": "MISC", 48 "type": "MISC",
69 </params> 80 </params>
70 """ 81 """
71 82
72 params_data = { 83 params_data = {
73 'category_name': CATEGORY, 84 'category_name': CATEGORY,
74 'category_label': _(CATEGORY), #FIXME: gof: vérifier que gettext gère ça 85 'category_label': _(CATEGORY), #FIXME: check that gettext can handle this
75 'name': NAME, 86 'name': NAME,
76 'label': _(NAME), 87 'label': _(NAME),
77 'default': _SYNTAX_XHTML, 88 'default': _SYNTAX_XHTML,
78 'syntaxes': {}, 89 'syntaxes': {},
79 } 90 }
90 except ImportError: 101 except ImportError:
91 warning("markdown or html2text not found, can't use Markdown syntax") 102 warning("markdown or html2text not found, can't use Markdown syntax")
92 host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s', 103 host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s',
93 async=True, method=self.convert) 104 async=True, method=self.convert)
94 105
95 # TODO: gof: nettoyage XHTML avec lxml
96
97 def _updateParamOptions(self): 106 def _updateParamOptions(self):
98 data_synt = TextSyntaxes.params_data['syntaxes'] 107 data_synt = TextSyntaxes.params_data['syntaxes']
99 syntaxes = [] 108 syntaxes = []
100 109
101 for syntax in data_synt.keys(): 110 for syntax in data_synt.keys():
122 131
123 def clean_xhtml(self, xhtml): 132 def clean_xhtml(self, xhtml):
124 """ Clean XHTML text by removing potentially dangerous/malicious parts 133 """ Clean XHTML text by removing potentially dangerous/malicious parts
125 @param xhtml: raw xhtml text to clean 134 @param xhtml: raw xhtml text to clean
126 """ 135 """
127 # FIXME: styles are allowed but not cleaned, they have to be cleaned (whitelist ? cssutils ?) !
128 def blocking_cleaning(xhtml): 136 def blocking_cleaning(xhtml):
129 safe_attrs = html.defs.safe_attrs.union(('style',)) 137 """ Clean XHTML and style attributes """
138
139 def clean_style(styles_raw):
140 """" Remove styles not in the whitelist,
141 or where the value doesn't match the regex """
142 styles = styles_raw.split(";")
143 cleaned_styles = []
144 for style in styles:
145 try:
146 key, value = style.split(':')
147 except ValueError:
148 continue
149 key = key.lower().strip()
150 if key not in STYLES_WHITELIST:
151 continue
152 value = value.lower().strip()
153 if not STYLES_ACCEPTED_VALUE.match(value):
154 continue
155 if value == "none":
156 continue
157 cleaned_styles.append((key, value))
158 return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles])
159
130 xhtml_elt = html.fromstring(xhtml) 160 xhtml_elt = html.fromstring(xhtml)
131 cleaner = clean.Cleaner(style=False, 161 cleaner = clean.Cleaner(style=False,
132 add_nofollow=False, 162 add_nofollow=False,
133 safe_attrs=safe_attrs) 163 safe_attrs=SAFE_ATTRS)
134 return html.tostring(cleaner.clean_html(xhtml_elt), method='xml') 164 xhtml_elt = cleaner.clean_html(xhtml_elt)
165 for elt in xhtml_elt.xpath("//*[@style]"):
166 elt.set("style", clean_style(elt.get('style')))
167 return html.tostring(xhtml_elt, method='xml')
135 168
136 d = deferToThread(blocking_cleaning, xhtml) 169 d = deferToThread(blocking_cleaning, xhtml)
137 return d 170 return d
138 171
139 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True): 172 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True):