Mercurial > libervia-backend
comparison sat/plugins/plugin_misc_text_syntaxes.py @ 2562:26edcf3a30eb
core, setup: huge cleaning:
- moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention
- move twisted directory to root
- removed all hacks from setup.py, and added missing dependencies, it is now clean
- use https URL for website in setup.py
- removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed
- renamed sat.sh to sat and fixed its installation
- added python_requires to specify Python version needed
- replaced glib2reactor which uses deprecated code by gtk3reactor
sat can now be installed directly from virtualenv without using --system-site-packages anymore \o/
author | Goffi <goffi@goffi.org> |
---|---|
date | Mon, 02 Apr 2018 19:44:50 +0200 |
parents | src/plugins/plugin_misc_text_syntaxes.py@0046283a285d |
children | 56f94936df1e |
comparison
equal
deleted
inserted
replaced
2561:bd30dc3ffe5a | 2562:26edcf3a30eb |
---|---|
1 #!/usr/bin/env python2 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # SAT plugin for managing various text syntaxes | |
5 # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org) | |
6 | |
7 # This program is free software: you can redistribute it and/or modify | |
8 # it under the terms of the GNU Affero General Public License as published by | |
9 # the Free Software Foundation, either version 3 of the License, or | |
10 # (at your option) any later version. | |
11 | |
12 # This program is distributed in the hope that it will be useful, | |
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 # GNU Affero General Public License for more details. | |
16 | |
17 # You should have received a copy of the GNU Affero General Public License | |
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | |
20 from sat.core.i18n import _, D_ | |
21 from sat.core.constants import Const as C | |
22 from sat.core.log import getLogger | |
23 log = getLogger(__name__) | |
24 | |
25 from twisted.internet import defer | |
26 from twisted.internet.threads import deferToThread | |
27 from sat.core import exceptions | |
28 try: | |
29 from lxml import html | |
30 from lxml.html import clean | |
31 except ImportError: | |
32 raise exceptions.MissingModule(u"Missing module lxml, please download/install it from http://lxml.de/") | |
33 from cgi import escape | |
34 import re | |
35 | |
36 | |
# category and name of the parameter holding the syntax selected by a profile
CATEGORY = D_("Composition")
NAME = "Syntax"
_SYNTAX_XHTML = "XHTML"
# placeholder accepted by TextSyntaxes.convert to mean "syntax selected by the profile"
_SYNTAX_CURRENT = "@CURRENT@"

# TODO: check/adapt following list
# list initially based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
# only the CSS properties listed here are kept when a "style" attribute is cleaned
STYLES_WHITELIST = (
    "azimuth", "background-color", "border-bottom-color", "border-collapse",
    "border-color", "border-left-color", "border-right-color", "border-top-color",
    "clear", "color", "cursor", "direction", "display", "elevation", "float",
    "font", "font-family", "font-size", "font-style", "font-variant", "font-weight",
    "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after",
    "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header",
    "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align",
    "text-decoration", "text-indent", "unicode-bidi", "vertical-align",
    "voice-family", "volume", "white-space", "width",
)

# attributes kept by the lxml cleaner: lxml's own safe list plus a few extra ones
SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
# Regular expression describing the CSS values accepted in a "style" attribute.
# Values are expected to be lower-cased and stripped before being matched.
# The decimal separator is escaped: a bare "." would accept any character there
# (e.g. "1x5em"), which defeats the purpose of the whitelist.
_NUMBER = r'\d+(\.\d+)?'
STYLES_VALUES_REGEX = r'^(' + '|'.join([
    r'([a-z-]+)',  # alphabetical names
    r'(#[0-9a-f]+)',  # hex value
    r'(' + _NUMBER + r' *(|%|em|ex|px|in|cm|mm|pt|pc))',  # values with units (or not)
    r'rgb\( *((' + _NUMBER + r'), *){2}(' + _NUMBER + r') *\)',  # rgb function
    r'rgba\( *((' + _NUMBER + r'), *){3}(' + _NUMBER + r') *\)',  # rgba function
]) + r') *(!important)?$'  # we accept "!important" at the end
STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
54 | |
# plugin metadata, keyed by the C.PI_* constants
PLUGIN_INFO = {
    C.PI_NAME: "Text syntaxes",
    C.PI_IMPORT_NAME: "TEXT-SYNTAXES",
    C.PI_TYPE: "MISC",
    C.PI_PROTOCOLS: [],
    C.PI_DEPENDENCIES: [],
    C.PI_MAIN: "TextSyntaxes",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("Management of various text syntaxes (XHTML-IM, Markdown, etc)"),
}
65 | |
66 | |
class TextSyntaxes(object):
    """Text conversion class

    XHTML utf-8 is used as intermediate language for conversions: each
    registered syntax provides a callback converting it to XHTML, and a
    callback converting XHTML back to it.
    """

    OPT_DEFAULT = "DEFAULT"
    OPT_HIDDEN = "HIDDEN"
    OPT_NO_THREAD = "NO_THREAD"
    SYNTAX_XHTML = _SYNTAX_XHTML
    SYNTAX_MARKDOWN = "markdown"
    SYNTAX_TEXT = "text"
    # registry of known syntaxes (class level, shared by all instances)
    # keys are the lower-cased and stripped syntax names
    syntaxes = {}
    # must be lower case, as it is compared with the keys of "syntaxes"
    default_syntax = SYNTAX_XHTML.lower()

    params = """
    <params>
    <individual>
    <category name="%(category_name)s" label="%(category_label)s">
        <param name="%(name)s" label="%(label)s" type="list" security="0">
            %(options)s
        </param>
    </category>
    </individual>
    </params>
    """

    params_data = {
        'category_name': CATEGORY,
        'category_label': _(CATEGORY),
        'name': NAME,
        'label': _(NAME),
        'syntaxes': syntaxes,
    }

    def __init__(self, host):
        """Register the built-in syntaxes and the bridge methods

        @param host: object giving access to "memory" and "bridge"
        """
        log.info(_("Text syntaxes plugin initialization"))
        self.host = host
        # XHTML is the pivot syntax, so both conversions are the identity
        # flags must be a list: with a bare string, the "in" tests done on
        # flags would be substring tests
        self.addSyntax(self.SYNTAX_XHTML, defer.succeed, defer.succeed,
                       [TextSyntaxes.OPT_NO_THREAD])
        # TODO: text => XHTML should add <a/> to url like in frontends
        #       it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
        self.addSyntax(self.SYNTAX_TEXT, escape, self._removeMarkups, [TextSyntaxes.OPT_HIDDEN])
        try:
            import markdown
            import html2text

            def _html2text(html, baseurl=''):
                h = html2text.HTML2Text(baseurl=baseurl)
                h.body_width = 0  # do not truncate the lines, it breaks the long URLs
                return h.handle(html)
            self.addSyntax(self.SYNTAX_MARKDOWN, markdown.markdown, _html2text, [TextSyntaxes.OPT_DEFAULT])
        except ImportError:
            log.warning(u"markdown or html2text not found, can't use Markdown syntax")
            log.info(u"You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/")
        # "async" is given through dict unpacking because it is a reserved
        # word starting with Python 3.7; the call is strictly equivalent to
        # using async=True
        host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssbs', out_sign='s',
                              method=self.convert, **{"async": True})
        host.bridge.addMethod("syntaxGet", ".plugin", in_sign='s', out_sign='s',
                              method=self.getSyntax)

    def _updateParamOptions(self):
        """Rebuild the syntax parameter from the registered syntaxes

        Hidden syntaxes are skipped, the others are sorted case-insensitively
        and the default one is marked as selected.
        """
        default_synt = TextSyntaxes.default_syntax
        visible = [key for key, data in TextSyntaxes.syntaxes.items()
                   if TextSyntaxes.OPT_HIDDEN not in data["flags"]]
        visible.sort(key=lambda synt: synt.lower())

        options = []
        for syntax in visible:
            selected = 'selected="true"' if syntax == default_synt else ''
            options.append(u'<option value="%s" %s/>' % (syntax, selected))

        TextSyntaxes.params_data["options"] = u'\n'.join(options)
        self.host.memory.updateParams(TextSyntaxes.params % TextSyntaxes.params_data)

    def getCurrentSyntax(self, profile):
        """ Return the selected syntax for the given profile

        @param profile: %(doc_profile)s
        @return: profile selected syntax
        """
        return self.host.memory.getParamA(NAME, CATEGORY, profile_key=profile)

    def _logError(self, failure, action=u"converting syntax"):
        """Errback logging the failure, then passing it to the next errback

        @param failure: failure received by the errback
        @param action(unicode): description of what was being done
        @return: the failure, unchanged
        """
        log.error(u"Error while {action}: {failure}".format(action=action, failure=failure))
        return failure

    def cleanXHTML(self, xhtml):
        """ Clean XHTML text by removing potentially dangerous/malicious parts

        The cleaning is run in a thread with deferToThread.
        @param xhtml: raw xhtml text to clean (or lxml's HtmlElement)
        @return: deferred firing with the cleaned XHTML, serialised as unicode
            it fails with exceptions.DataError if xhtml has an unexpected type
        """
        def blocking_cleaning(xhtml):
            """ Clean XHTML and style attributes """

            def clean_style(styles_raw):
                """ Remove styles not in the whitelist,
                or where the value doesn't match the regex """
                cleaned_styles = []
                for style in styles_raw.split(";"):
                    try:
                        key, value = style.split(':')
                    except ValueError:
                        # not exactly one "key: value" pair, we drop it
                        continue
                    key = key.lower().strip()
                    if key not in STYLES_WHITELIST:
                        continue
                    value = value.lower().strip()
                    if not STYLES_ACCEPTED_VALUE.match(value):
                        continue
                    if value == "none":
                        continue
                    cleaned_styles.append((key, value))
                return "; ".join(["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles])

            if isinstance(xhtml, basestring):
                xhtml_elt = html.fromstring(xhtml)
            elif isinstance(xhtml, html.HtmlElement):
                xhtml_elt = xhtml
            else:
                log.error("Only strings and HtmlElements can be cleaned")
                raise exceptions.DataError
            # style attributes are kept by the cleaner (style=False and
            # 'style' is in SAFE_ATTRS), they are filtered by clean_style below
            cleaner = clean.Cleaner(style=False,
                                    add_nofollow=False,
                                    safe_attrs=SAFE_ATTRS)
            xhtml_elt = cleaner.clean_html(xhtml_elt)
            for elt in xhtml_elt.xpath("//*[@style]"):
                elt.set("style", clean_style(elt.get('style')))
            return html.tostring(xhtml_elt, encoding=unicode, method='xml')

        d = deferToThread(blocking_cleaning, xhtml)
        d.addErrback(self._logError, action=u"cleaning syntax")
        return d

    def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None):
        """Convert a text between two syntaxes

        @param text: text to convert
        @param syntax_from: source syntax (e.g. "markdown")
        @param syntax_to: dest syntax (e.g.: "XHTML")
        @param safe: clean resulting XHTML to avoid malicious code if True
        @param profile: needed only when syntax_from or syntax_to is set to _SYNTAX_CURRENT
        @return(unicode): converted text
        @raise exceptions.NotFound: syntax_from or syntax_to is not registered
        """
        # FIXME: convert should be abled to handle domish.Element directly
        #        when dealing with XHTML
        # TODO: a way for parser to return parsing errors/warnings

        if syntax_from == _SYNTAX_CURRENT:
            syntax_from = self.getCurrentSyntax(profile)
        else:
            syntax_from = syntax_from.lower().strip()
        if syntax_to == _SYNTAX_CURRENT:
            syntax_to = self.getCurrentSyntax(profile)
        else:
            syntax_to = syntax_to.lower().strip()
        syntaxes = TextSyntaxes.syntaxes
        if syntax_from not in syntaxes:
            raise exceptions.NotFound(syntax_from)
        if syntax_to not in syntaxes:
            raise exceptions.NotFound(syntax_to)

        # first step: source syntax => XHTML
        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_from]["flags"]:
            d = defer.maybeDeferred(syntaxes[syntax_from]["to"], text)
        else:
            d = deferToThread(syntaxes[syntax_from]["to"], text)

        # TODO: keep only body element and change it to a div here ?

        if safe:
            d.addCallback(self.cleanXHTML)

        # second step: XHTML => destination syntax
        if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_to]["flags"]:
            d.addCallback(syntaxes[syntax_to]["from"])
        else:
            d.addCallback(lambda xhtml: deferToThread(syntaxes[syntax_to]["from"], xhtml))

        # converters can add new lines that disturb the microblog change detection
        d.addCallback(lambda text: text.rstrip())
        return d

    def addSyntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
        """Add a new syntax to the manager

        @param name: unique name of the syntax
        @param to_xhtml_cb: callback to convert from syntax to XHTML
        @param from_xhtml_cb: callback to convert from XHTML to syntax
        @param flags: set of optional flags, can be:
            TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
            TextSyntaxes.OPT_HIDDEN: do not show in parameters
            TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting (the callback may then return a deferred)
        @raise ValueError: OPT_HIDDEN and OPT_DEFAULT are used together
        @raise exceptions.ConflictError: a syntax with this name is already registered
        """
        flags = flags if flags is not None else []
        if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
            raise ValueError(u"{} and {} are mutually exclusive".format(TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT))

        syntaxes = TextSyntaxes.syntaxes
        key = name.lower().strip()
        if key in syntaxes:
            raise exceptions.ConflictError(u"This syntax key already exists: {}".format(key))
        syntaxes[key] = {"name": name, "to": to_xhtml_cb, "from": from_xhtml_cb, "flags": flags}
        if TextSyntaxes.OPT_DEFAULT in flags:
            # the attribute read by _updateParamOptions is "default_syntax"
            TextSyntaxes.default_syntax = key

        self._updateParamOptions()

    def getSyntax(self, name):
        """get syntax key corresponding to a name

        @param name: name of the syntax (case and surrounding spaces are ignored)
        @return: key of the syntax in TextSyntaxes.syntaxes
        @raise exceptions.NotFound: syntax doesn't exist
        """
        key = name.lower().strip()
        if key in self.syntaxes:
            return key
        raise exceptions.NotFound(name)

    def _removeMarkups(self, xhtml):
        """Remove XHTML markups from the given string.

        @param xhtml: the XHTML string to be cleaned
        @return: the cleaned string
        """
        # <style> elements are removed with their content, so that CSS
        # doesn't end up in the text
        cleaner = clean.Cleaner(kill_tags=['style'])
        cleaned = cleaner.clean_html(html.fromstring(xhtml))
        return html.tostring(cleaned, encoding=unicode, method="text")