Mercurial > libervia-backend
comparison libervia/backend/plugins/plugin_misc_text_syntaxes.py @ 4071:4b842c1fb686
refactoring: renamed `sat` package to `libervia.backend`
author | Goffi <goffi@goffi.org> |
---|---|
date | Fri, 02 Jun 2023 11:49:51 +0200 |
parents | sat/plugins/plugin_misc_text_syntaxes.py@00dbc3370d35 |
children | 26b7ed2817da |
comparison
equal
deleted
inserted
replaced
4070:d10748475025 | 4071:4b842c1fb686 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 # SAT plugin for managing various text syntaxes | |
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org) | |
6 | |
7 # This program is free software: you can redistribute it and/or modify | |
8 # it under the terms of the GNU Affero General Public License as published by | |
9 # the Free Software Foundation, either version 3 of the License, or | |
10 # (at your option) any later version. | |
11 | |
12 # This program is distributed in the hope that it will be useful, | |
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 # GNU Affero General Public License for more details. | |
16 | |
17 # You should have received a copy of the GNU Affero General Public License | |
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | |
20 from functools import partial | |
21 from html import escape | |
22 import re | |
23 from typing import Set | |
24 | |
25 from twisted.internet import defer | |
26 from twisted.internet.threads import deferToThread | |
27 | |
28 from libervia.backend.core import exceptions | |
29 from libervia.backend.core.constants import Const as C | |
30 from libervia.backend.core.i18n import D_, _ | |
31 from libervia.backend.core.log import getLogger | |
32 from libervia.backend.tools import xml_tools | |
33 | |
34 try: | |
35 from lxml import html | |
36 from lxml.html import clean | |
37 from lxml import etree | |
38 except ImportError: | |
39 raise exceptions.MissingModule( | |
40 "Missing module lxml, please download/install it from http://lxml.de/" | |
41 ) | |
42 | |
43 log = getLogger(__name__) | |
44 | |
# Category and name of the parameter holding the syntax selected by a profile
CATEGORY = D_("Composition")
NAME = "Syntax"

# Key of the pivot syntax used for every conversion
_SYNTAX_XHTML = "xhtml"  # must be lower case
# Placeholder replaced by the profile's selected syntax when converting
_SYNTAX_CURRENT = "@CURRENT@"
49 | |
# TODO: check/adapt following list
# CSS properties kept when cleaning a "style" attribute; list initially based on
# the feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
STYLES_WHITELIST = (
    "azimuth", "background-color", "border-bottom-color", "border-collapse",
    "border-color", "border-left-color", "border-right-color", "border-top-color",
    "clear", "color", "cursor", "direction", "display", "elevation", "float",
    "font", "font-family", "font-size", "font-style", "font-variant", "font-weight",
    "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after",
    "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header",
    "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align",
    "text-decoration", "text-indent", "unicode-bidi", "vertical-align",
    "voice-family", "volume", "white-space", "width",
)
100 | |
# Elements which never have content, and may therefore stay self-closing
# cf. https://www.w3.org/TR/html/syntax.html#void-elements
VOID_ELEMENTS = (
    "area", "base", "br", "col", "embed", "hr", "img", "input",
    "keygen", "link", "menuitem", "meta", "param", "source", "track", "wbr",
)
119 | |
# Attributes kept by the cleaner: lxml's defaults plus a few extra ones, minus "id"
SAFE_ATTRS = (html.defs.safe_attrs | {"style", "poster", "controls"}) - {"id"}

# Only those "class" values survive cleaning
SAFE_CLASSES = {
    # those classes are used for code highlighting
    "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge",
    "get", "gh", "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll",
    "il", "k", "kc", "kd", "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi",
    "mo", "na", "nb", "nc", "nd", "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv",
    "o", "ow", "s", "sa", "sb", "sc", "sd", "se", "sh", "si", "sr", "ss", "sx",
    "vc", "vg", "vi", "vm", "w", "write",
}
# Pattern that a CSS value must fully match to be kept when cleaning styles.
# Raw strings are used so that "\d" and "\(" reach the regex engine untouched
# (they are invalid escape sequences in a normal string literal), and the decimal
# separator is escaped so that it only matches a literal dot.
STYLES_VALUES_REGEX = (
    r"^("
    + "|".join(
        [
            r"([a-z-]+)",  # alphabetical names
            r"(#[0-9a-f]+)",  # hex value
            r"(\d+(\.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))",  # values with units (or not)
            r"rgb\( *((\d+(\.\d+)?), *){2}(\d+(\.\d+)?) *\)",  # rgb function
            r"rgba\( *((\d+(\.\d+)?), *){3}(\d+(\.\d+)?) *\)",  # rgba function
        ]
    )
    + r") *(!important)?$"
)  # we accept "!important" at the end
STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
143 | |
# Plugin metadata, keyed by the "PI_*" constants of the backend
PLUGIN_INFO = {
    C.PI_NAME: "Text syntaxes",
    C.PI_IMPORT_NAME: "TEXT_SYNTAXES",
    C.PI_TYPE: "MISC",
    C.PI_MODES: C.PLUG_MODE_BOTH,
    C.PI_PROTOCOLS: [],
    C.PI_DEPENDENCIES: [],
    C.PI_MAIN: "TextSyntaxes",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(
        "Management of various text syntaxes (XHTML-IM, Markdown, etc)"
    ),
}
157 | |
158 | |
class TextSyntaxes:
    """Text conversion class

    XHTML (utf-8) is used as intermediate language for conversions: each
    registered syntax provides one callback converting it to XHTML and one
    callback converting XHTML back to it.
    """

    OPT_DEFAULT = "DEFAULT"
    OPT_HIDDEN = "HIDDEN"
    OPT_NO_THREAD = "NO_THREAD"
    SYNTAX_XHTML = _SYNTAX_XHTML
    SYNTAX_MARKDOWN = "markdown"
    SYNTAX_TEXT = "text"
    # default_syntax must be lower case
    default_syntax = SYNTAX_XHTML

    def __init__(self, host):
        """Register the built-in syntaxes and the bridge methods

        @param host: backend instance, its ``memory`` and ``bridge`` attributes
            are used by this plugin
        """
        log.info(_("Text syntaxes plugin initialization"))
        self.host = host
        self.syntaxes = {}

        self.params = """
            <params>
            <individual>
            <category name="%(category_name)s" label="%(category_label)s">
                <param name="%(name)s" label="%(label)s" type="list" security="0">
                    %(options)s
                </param>
            </category>
            </individual>
            </params>
        """

        self.params_data = {
            "category_name": CATEGORY,
            "category_label": _(CATEGORY),
            "name": NAME,
            "label": _(NAME),
            "syntaxes": self.syntaxes,
        }

        # XHTML is the pivot syntax, nothing to convert.
        # Flags must be a collection: with a bare string, the "in" tests done on
        # flags would silently become substring tests.
        self.add_syntax(
            self.SYNTAX_XHTML,
            defer.succeed,
            defer.succeed,
            [TextSyntaxes.OPT_NO_THREAD],
        )
        # TODO: text => XHTML should add <a/> to url like in frontends
        # it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
        self.add_syntax(
            self.SYNTAX_TEXT,
            escape,
            self._remove_markups,
            [TextSyntaxes.OPT_HIDDEN],
        )
        try:
            import markdown
            import html2text
            from markdown.extensions import Extension

            # XXX: we disable raw HTML parsing by default, to avoid parsing error
            #      when the user is not aware of markdown and HTML
            class EscapeHTML(Extension):
                def extendMarkdown(self, md):
                    md.preprocessors.deregister('html_block')
                    md.inlinePatterns.deregister('html')

            def _html2text(xhtml, baseurl=""):
                h = html2text.HTML2Text(baseurl=baseurl)
                h.body_width = 0  # do not truncate the lines, it breaks the long URLs
                return h.handle(xhtml)

            self.add_syntax(
                self.SYNTAX_MARKDOWN,
                partial(
                    markdown.markdown,
                    extensions=[
                        EscapeHTML(),
                        'nl2br',
                        'codehilite',
                        'fenced_code',
                        'sane_lists',
                        'tables',
                    ],
                    extension_configs={
                        "codehilite": {
                            "css_class": "highlight",
                        }
                    },
                ),
                _html2text,
                [TextSyntaxes.OPT_DEFAULT],
            )
        except ImportError:
            log.warning("markdown or html2text not found, can't use Markdown syntax")
            log.info(
                "You can download/install them from https://pythonhosted.org/Markdown/ "
                "and https://github.com/Alir3z4/html2text/"
            )
        host.bridge.add_method(
            "syntax_convert",
            ".plugin",
            in_sign="sssbs",
            out_sign="s",
            async_=True,
            method=self.convert,
        )
        host.bridge.add_method(
            "syntax_get", ".plugin", in_sign="s", out_sign="s", method=self.get_syntax
        )
        if xml_tools.clean_xhtml is None:
            log.debug("Installing cleaning method")
            xml_tools.clean_xhtml = self.clean_xhtml

    def _update_param_options(self):
        """Rebuild the options of the syntax parameter and update host params

        Syntaxes flagged with OPT_HIDDEN are not listed, and the current default
        syntax is marked as selected.
        """
        visible = sorted(
            (
                key
                for key, data in self.syntaxes.items()
                if TextSyntaxes.OPT_HIDDEN not in data["flags"]
            ),
            key=str.lower,
        )
        default_synt = TextSyntaxes.default_syntax
        options = []
        for syntax in visible:
            selected = 'selected="true"' if syntax == default_synt else ""
            # the key ends up in an XML attribute, so it must be escaped
            options.append('<option value="%s" %s/>' % (escape(syntax), selected))

        self.params_data["options"] = "\n".join(options)
        self.host.memory.update_params(self.params % self.params_data)

    def get_current_syntax(self, profile):
        """Return the selected syntax for the given profile

        @param profile: %(doc_profile)s
        @return: profile selected syntax
        """
        return self.host.memory.param_get_a(NAME, CATEGORY, profile_key=profile)

    def _log_error(self, failure, action="converting syntax"):
        """Log a failure and return it unchanged

        @param failure: failure to log
        @param action: description of what was being done, used in the message
        @return: the failure received
        """
        log.error(
            "Error while {action}: {failure}".format(action=action, failure=failure)
        )
        return failure

    def clean_style(self, styles_raw: str) -> str:
        """Clean unsafe CSS styles

        Remove styles not in the whitelist, or where the value doesn't match the regex
        @param styles_raw: CSS styles
        @return: cleaned styles
        """
        cleaned_styles = []
        for style in styles_raw.split(";"):
            parts = style.split(":")
            if len(parts) != 2:
                # no value, or a value containing ":": the declaration is dropped
                continue
            key = parts[0].lower().strip()
            if key not in STYLES_WHITELIST:
                continue
            value = parts[1].lower().strip()
            if not STYLES_ACCEPTED_VALUE.match(value):
                continue
            if value == "none":
                continue
            cleaned_styles.append("%s: %s" % (key, value))
        return "; ".join(cleaned_styles)

    def clean_classes(self, classes_raw: str) -> str:
        """Remove any non whitelisted class

        Remaining classes are returned once each, in the order of their first
        appearance, so the result is the same from one call to another.
        @param classes_raw: classes set on an element
        @return: remaining classes (can be empty string)
        """
        kept = dict.fromkeys(
            class_ for class_ in classes_raw.split() if class_ in SAFE_CLASSES
        )
        return " ".join(kept)

    def clean_xhtml(self, xhtml):
        """Clean XHTML text by removing potentially dangerous/malicious parts

        @param xhtml(str, lxml.html.HtmlElement): raw HTML/XHTML to clean
        @return (str): cleaned XHTML, empty string if xhtml is a blank string
        @raise lxml.etree.ParserError: a non blank string can't be parsed
        @raise exceptions.DataError: xhtml is neither a string nor an HtmlElement
        """
        if isinstance(xhtml, str):
            try:
                xhtml_elt = html.fromstring(xhtml)
            except etree.ParserError:
                if not xhtml.strip():
                    return ""
                log.error("Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
                raise
        elif isinstance(xhtml, html.HtmlElement):
            xhtml_elt = xhtml
        else:
            log.error("Only strings and HtmlElements can be cleaned")
            raise exceptions.DataError
        cleaner = clean.Cleaner(
            style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
        )
        xhtml_elt = cleaner.clean_html(xhtml_elt)
        for elt in xhtml_elt.xpath("//*[@style]"):
            elt.set("style", self.clean_style(elt.get("style")))
        for elt in xhtml_elt.xpath("//*[@class]"):
            elt.set("class", self.clean_classes(elt.get("class")))
        # we remove self-closing elements for non-void elements
        for element in xhtml_elt.iter(tag=etree.Element):
            if not element.text:
                element.text = None if element.tag in VOID_ELEMENTS else ''
        return html.tostring(xhtml_elt, encoding=str, method="xml")

    def _resolve_syntax(self, syntax, profile):
        """Return the syntax key to use, resolving the _SYNTAX_CURRENT placeholder"""
        if syntax == _SYNTAX_CURRENT:
            return self.get_current_syntax(profile)
        return syntax.lower().strip()

    def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True,
                profile=None):
        """Convert a text between two syntaxes

        @param text: text to convert
        @param syntax_from: source syntax (e.g. "markdown")
        @param syntax_to: dest syntax (e.g.: "XHTML")
        @param safe: clean resulting XHTML to avoid malicious code if True
        @param profile: needed only when syntax_from or syntax_to is set to
            _SYNTAX_CURRENT
        @return(Deferred): fires with the converted text
        @raise exceptions.NotFound: one of the syntaxes is not registered
        """
        # FIXME: convert should be abled to handle domish.Element directly
        #        when dealing with XHTML
        # TODO: a way for parser to return parsing errors/warnings
        syntax_from = self._resolve_syntax(syntax_from, profile)
        syntax_to = self._resolve_syntax(syntax_to, profile)
        syntaxes = self.syntaxes
        if syntax_from not in syntaxes:
            raise exceptions.NotFound(syntax_from)
        if syntax_to not in syntaxes:
            raise exceptions.NotFound(syntax_to)

        source = syntaxes[syntax_from]
        dest = syntaxes[syntax_to]
        to_xhtml = source["to"]
        from_xhtml = dest["from"]

        if TextSyntaxes.OPT_NO_THREAD in source["flags"]:
            d = defer.maybeDeferred(to_xhtml, text)
        else:
            d = deferToThread(to_xhtml, text)

        # TODO: keep only body element and change it to a div here ?

        if safe:
            d.addCallback(self.clean_xhtml)

        if TextSyntaxes.OPT_NO_THREAD in dest["flags"]:
            d.addCallback(from_xhtml)
        else:
            d.addCallback(lambda xhtml: deferToThread(from_xhtml, xhtml))

        # converters can add new lines that disturb the microblog change detection
        d.addCallback(lambda converted: converted.rstrip())
        return d

    def add_syntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
        """Add a new syntax to the manager

        @param name: unique name of the syntax
        @param to_xhtml_cb: callback to convert from syntax to XHTML
        @param from_xhtml_cb: callback to convert from XHTML to syntax
        @param flags: set of optional flags, can be:
            TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
            TextSyntaxes.OPT_HIDDEN: do not show in parameters
            TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting
                (the callback may then return a deferred)
        @raise ValueError: OPT_HIDDEN and OPT_DEFAULT are used together
        @raise exceptions.ConflictError: a syntax with this name already exists
        """
        flags = flags if flags is not None else []
        if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
            raise ValueError(
                "{} and {} are mutually exclusive".format(
                    TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT
                )
            )

        syntaxes = self.syntaxes
        key = name.lower().strip()
        if key in syntaxes:
            raise exceptions.ConflictError(
                "This syntax key already exists: {}".format(key)
            )
        syntaxes[key] = {
            "name": name,
            "to": to_xhtml_cb,
            "from": from_xhtml_cb,
            "flags": flags,
        }
        if TextSyntaxes.OPT_DEFAULT in flags:
            TextSyntaxes.default_syntax = key

        self._update_param_options()

    def get_syntax(self, name):
        """get syntax key corresponding to a name

        @param name: name of the syntax, case and surrounding spaces are ignored
        @return: key of the syntax
        @raise exceptions.NotFound: syntax doesn't exist
        """
        key = name.lower().strip()
        if key in self.syntaxes:
            return key
        raise exceptions.NotFound(name)

    def _remove_markups(self, xhtml):
        """Remove XHTML markups from the given string.

        @param xhtml: the XHTML string to be cleaned
        @return: the cleaned string
        """
        if not xhtml.strip():
            # a blank document can't be parsed, and there is nothing to remove
            return ""
        cleaner = clean.Cleaner(kill_tags=["style"])
        cleaned = cleaner.clean_html(html.fromstring(xhtml))
        return html.tostring(cleaned, encoding=str, method="text")