Mercurial > libervia-backend
comparison src/plugins/plugin_misc_text_syntaxes.py @ 692:e98db42cd78c
plugin text syntaxes: styles sanitisation
author | Goffi <goffi@goffi.org> |
---|---|
date | Tue, 12 Nov 2013 14:14:36 +0100 |
parents | fb0b1100c908 |
children | ddd2781bdd8c |
comparison legend: equal | deleted | inserted | replaced
691:481e0f8ae47c | 692:e98db42cd78c |
---|---|
22 from wokkel import disco, pubsub | 22 from wokkel import disco, pubsub |
23 from twisted.internet import defer | 23 from twisted.internet import defer |
24 from twisted.internet.threads import deferToThread | 24 from twisted.internet.threads import deferToThread |
25 from lxml import html | 25 from lxml import html |
26 from lxml.html import clean | 26 from lxml.html import clean |
27 import re | |
27 | 28 |
28 | 29 |
29 CATEGORY = "Composition" | 30 CATEGORY = "Composition" |
30 NAME = "Syntax" | 31 NAME = "Syntax" |
31 _SYNTAX_XHTML = "XHTML" | 32 _SYNTAX_XHTML = "XHTML" |
32 | 33 # TODO: check/adapt following list |
34 STYLES_WHITELIST = ["azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width"] # based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html) | |
35 | |
36 SAFE_ATTRS = html.defs.safe_attrs.union(('style',)) | |
37 STYLES_VALUES_REGEX = r'^(' + '|'.join(['([a-z-]+)', # alphabetical names | |
38 '(#[0-9a-f]+)', # hex value | |
39 '(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))', # values with units (or not) | |
40 'rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)', # rgb function | |
41 'rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)', # rgba function | |
42 ]) + ') *(!important)?$' # we accept "!important" at the end | |
43 STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX) | |
33 | 44 |
34 PLUGIN_INFO = { | 45 PLUGIN_INFO = { |
35 "name": "Text syntaxes", | 46 "name": "Text syntaxes", |
36 "import_name": "TEXT-SYNTAXES", | 47 "import_name": "TEXT-SYNTAXES", |
37 "type": "MISC", | 48 "type": "MISC", |
69 </params> | 80 </params> |
70 """ | 81 """ |
71 | 82 |
72 params_data = { | 83 params_data = { |
73 'category_name': CATEGORY, | 84 'category_name': CATEGORY, |
74 'category_label': _(CATEGORY), #FIXME: gof: vérifier que gettext gère ça | 85 'category_label': _(CATEGORY), #FIXME: check that gettext can handle this |
75 'name': NAME, | 86 'name': NAME, |
76 'label': _(NAME), | 87 'label': _(NAME), |
77 'default': _SYNTAX_XHTML, | 88 'default': _SYNTAX_XHTML, |
78 'syntaxes': {}, | 89 'syntaxes': {}, |
79 } | 90 } |
90 except ImportError: | 101 except ImportError: |
91 warning("markdown or html2text not found, can't use Markdown syntax") | 102 warning("markdown or html2text not found, can't use Markdown syntax") |
92 host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s', | 103 host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssb', out_sign='s', |
93 async=True, method=self.convert) | 104 async=True, method=self.convert) |
94 | 105 |
95 # TODO: gof: nettoyage XHTML avec lxml | |
96 | |
97 def _updateParamOptions(self): | 106 def _updateParamOptions(self): |
98 data_synt = TextSyntaxes.params_data['syntaxes'] | 107 data_synt = TextSyntaxes.params_data['syntaxes'] |
99 syntaxes = [] | 108 syntaxes = [] |
100 | 109 |
101 for syntax in data_synt.keys(): | 110 for syntax in data_synt.keys(): |
122 | 131 |
123 def clean_xhtml(self, xhtml): | 132 def clean_xhtml(self, xhtml): |
124 """ Clean XHTML text by removing potentially dangerous/malicious parts | 133 """ Clean XHTML text by removing potentially dangerous/malicious parts |
125 @param xhtml: raw xhtml text to clean | 134 @param xhtml: raw xhtml text to clean |
126 """ | 135 """ |
127 # FIXME: styles are allowed but not cleaned, they have to be cleaned (whitelist ? cssutils ?) ! | |
128 def blocking_cleaning(xhtml): | 136 def blocking_cleaning(xhtml): |
129 safe_attrs = html.defs.safe_attrs.union(('style',)) | 137 """ Clean XHTML and style attributes """ |
138 | |
139 def clean_style(styles_raw): | |
140 """" Remove styles not in the whitelist, | |
141 or where the value doesn't match the regex """ | |
142 styles = styles_raw.split(";") | |
143 cleaned_styles = [] | |
144 for style in styles: | |
145 try: | |
146 key, value = style.split(':') | |
147 except ValueError: | |
148 continue | |
149 key = key.lower().strip() | |
150 if key not in STYLES_WHITELIST: | |
151 continue | |
152 value = value.lower().strip() | |
153 if not STYLES_ACCEPTED_VALUE.match(value): | |
154 continue | |
155 if value == "none": | |
156 continue | |
157 cleaned_styles.append((key, value)) | |
158 return "; ".join(["%s: %s" % (key, value) for key, value in cleaned_styles]) | |
159 | |
130 xhtml_elt = html.fromstring(xhtml) | 160 xhtml_elt = html.fromstring(xhtml) |
131 cleaner = clean.Cleaner(style=False, | 161 cleaner = clean.Cleaner(style=False, |
132 add_nofollow=False, | 162 add_nofollow=False, |
133 safe_attrs=safe_attrs) | 163 safe_attrs=SAFE_ATTRS) |
134 return html.tostring(cleaner.clean_html(xhtml_elt), method='xml') | 164 xhtml_elt = cleaner.clean_html(xhtml_elt) |
165 for elt in xhtml_elt.xpath("//*[@style]"): | |
166 elt.set("style", clean_style(elt.get('style'))) | |
167 return html.tostring(xhtml_elt, method='xml') | |
135 | 168 |
136 d = deferToThread(blocking_cleaning, xhtml) | 169 d = deferToThread(blocking_cleaning, xhtml) |
137 return d | 170 return d |
138 | 171 |
139 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True): | 172 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True): |