comparison libervia/backend/plugins/plugin_misc_text_syntaxes.py @ 4071:4b842c1fb686

refactoring: renamed `sat` package to `libervia.backend`
author Goffi <goffi@goffi.org>
date Fri, 02 Jun 2023 11:49:51 +0200
parents sat/plugins/plugin_misc_text_syntaxes.py@00dbc3370d35
children 26b7ed2817da
comparison
equal deleted inserted replaced
4070:d10748475025 4071:4b842c1fb686
1 #!/usr/bin/env python3
2
3
4 # SAT plugin for managing various text syntaxes
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 from functools import partial
21 from html import escape
22 import re
23 from typing import Set
24
25 from twisted.internet import defer
26 from twisted.internet.threads import deferToThread
27
28 from libervia.backend.core import exceptions
29 from libervia.backend.core.constants import Const as C
30 from libervia.backend.core.i18n import D_, _
31 from libervia.backend.core.log import getLogger
32 from libervia.backend.tools import xml_tools
33
34 try:
35 from lxml import html
36 from lxml.html import clean
37 from lxml import etree
38 except ImportError:
39 raise exceptions.MissingModule(
40 "Missing module lxml, please download/install it from http://lxml.de/"
41 )
42
# logger dedicated to this module
log = getLogger(__name__)

# name and category of the parameter holding the syntax selected by a profile
NAME = "Syntax"
CATEGORY = D_("Composition")

# syntax keys are compared after lower-casing, so this one must be lower case
_SYNTAX_XHTML = "xhtml"
# placeholder replaced by the syntax currently selected by the profile
_SYNTAX_CURRENT = "@CURRENT@"
49
# TODO: check/adapt following list
# CSS properties allowed in "style" attributes; the list was initially based on
# the feedparser one (http://pythonhosted.org/feedparser/html-sanitization.html)
STYLES_WHITELIST = (
    "azimuth", "background-color",
    "border-bottom-color", "border-collapse", "border-color",
    "border-left-color", "border-right-color", "border-top-color",
    "clear", "color", "cursor", "direction", "display", "elevation", "float",
    "font", "font-family", "font-size", "font-style", "font-variant",
    "font-weight",
    "height", "letter-spacing", "line-height", "overflow",
    "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness",
    "speak", "speak-header", "speak-numeral", "speak-punctuation",
    "speech-rate", "stress",
    "text-align", "text-decoration", "text-indent",
    "unicode-bidi", "vertical-align", "voice-family", "volume",
    "white-space", "width",
)
100
# elements which never have content and may therefore stay self-closing
# cf. https://www.w3.org/TR/html/syntax.html#void-elements
VOID_ELEMENTS = (
    "area", "base", "br", "col",
    "embed", "hr", "img", "input",
    "keygen", "link", "menuitem", "meta",
    "param", "source", "track", "wbr",
)
119
# attributes kept when cleaning: lxml's own safe list extended with "style",
# "poster" and "controls", but without "id"
SAFE_ATTRS = (html.defs.safe_attrs | {"style", "poster", "controls"}) - {"id"}
# CSS classes kept when cleaning; those classes are used for code highlighting
SAFE_CLASSES = {
    "bp",
    "c", "ch", "cm", "cp", "cpf", "cs",
    "dl",
    "err",
    "fm",
    "gd", "ge", "get", "gh", "gi", "go", "gp", "gr", "gs", "gt", "gu",
    "highlight", "hll",
    "il",
    "k", "kc", "kd", "kn", "kp", "kr", "kt",
    "m", "mb", "mf", "mh", "mi", "mo",
    "na", "nb", "nc", "nd", "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv",
    "o", "ow",
    "s", "sa", "sb", "sc", "sd", "se", "sh", "si", "sr", "ss", "sx",
    "vc", "vg", "vi", "vm",
    "w", "write",
}
# Regular expression matching the CSS values accepted in "style" attributes.
# Raw strings are used so that "\d", "\(" and "\)" reach the regex engine
# untouched (they are invalid escape sequences in normal string literals), and
# the decimal separator is escaped so that it only matches a literal dot.
STYLES_VALUES_REGEX = (
    r"^("
    + "|".join(
        [
            r"([a-z-]+)",  # alphabetical names
            r"(#[0-9a-f]+)",  # hex value
            r"(\d+(\.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))",  # values with units (or not)
            r"rgb\( *((\d+(\.\d+)?), *){2}(\d+(\.\d+)?) *\)",  # rgb function
            r"rgba\( *((\d+(\.\d+)?), *){3}(\d+(\.\d+)?) *\)",  # rgba function
        ]
    )
    + r") *(!important)?$"
)  # we accept "!important" at the end
STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
143
# plugin metadata (presumably consumed by the backend plugin loader; the keys are
# constants defined in libervia.backend.core.constants)
PLUGIN_INFO = {
    C.PI_NAME: "Text syntaxes",
    C.PI_IMPORT_NAME: "TEXT_SYNTAXES",
    C.PI_TYPE: "MISC",
    C.PI_MODES: C.PLUG_MODE_BOTH,
    # no protocol implemented and no dependency on other plugins
    C.PI_PROTOCOLS: [],
    C.PI_DEPENDENCIES: [],
    # name of the class defined in this module
    C.PI_MAIN: "TextSyntaxes",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Management of various text syntaxes (XHTML-IM, Markdown, etc)"""),
}
157
158
class TextSyntaxes:
    """Text conversion between registered syntaxes

    UTF-8 XHTML is the intermediate language: a text is first converted from its
    source syntax to XHTML, then from XHTML to the destination syntax.
    """

    # flags which can be given when registering a syntax
    OPT_DEFAULT = "DEFAULT"  # use the syntax as default one
    OPT_HIDDEN = "HIDDEN"  # do not show the syntax in parameters
    OPT_NO_THREAD = "NO_THREAD"  # do not defer the conversion to a thread

    # keys of the syntaxes registered by the plugin itself
    SYNTAX_XHTML = _SYNTAX_XHTML
    SYNTAX_MARKDOWN = "markdown"
    SYNTAX_TEXT = "text"

    # default_syntax must be lower case
    default_syntax = SYNTAX_XHTML
173
def __init__(self, host):
    """Register built-in syntaxes, the syntax parameter and the bridge methods

    XHTML and plain text are always registered. Markdown is registered, and
    flagged as default syntax, only if both "markdown" and "html2text" modules
    can be imported.

    @param host: backend host, its "memory" and "bridge" attributes are used
    """
    log.info(_("Text syntaxes plugin initialization"))
    self.host = host
    self.syntaxes = {}

    # template of the parameter, filled each time a syntax is added
    self.params = """
        <params>
        <individual>
        <category name="%(category_name)s" label="%(category_label)s">
            <param name="%(name)s" label="%(label)s" type="list" security="0">
                %(options)s
            </param>
        </category>
        </individual>
        </params>
    """

    self.params_data = {
        "category_name": CATEGORY,
        "category_label": _(CATEGORY),
        "name": NAME,
        "label": _(NAME),
        "syntaxes": self.syntaxes,
    }

    # XHTML is the pivot syntax, converting it is a no-op.
    # Flags are given as a list like for other syntaxes: with a bare string,
    # the "in" tests done on flags would be substring matches.
    self.add_syntax(
        self.SYNTAX_XHTML,
        defer.succeed,
        defer.succeed,
        [TextSyntaxes.OPT_NO_THREAD],
    )
    # TODO: text => XHTML should add <a/> to url like in frontends
    #   it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
    self.add_syntax(
        self.SYNTAX_TEXT,
        escape,
        self._remove_markups,
        [TextSyntaxes.OPT_HIDDEN],
    )
    try:
        import markdown
        import html2text
        from markdown.extensions import Extension

        # XXX: we disable raw HTML parsing by default, to avoid parsing error
        #      when the user is not aware of markdown and HTML
        class EscapeHTML(Extension):
            def extendMarkdown(self, md):
                md.preprocessors.deregister('html_block')
                md.inlinePatterns.deregister('html')

        def _html2text(xhtml, baseurl=""):
            h = html2text.HTML2Text(baseurl=baseurl)
            h.body_width = 0  # do not truncate the lines, it breaks the long URLs
            return h.handle(xhtml)

        self.add_syntax(
            self.SYNTAX_MARKDOWN,
            partial(
                markdown.markdown,
                extensions=[
                    EscapeHTML(),
                    'nl2br',
                    'codehilite',
                    'fenced_code',
                    'sane_lists',
                    'tables',
                ],
                extension_configs={
                    "codehilite": {
                        "css_class": "highlight",
                    }
                },
            ),
            _html2text,
            [TextSyntaxes.OPT_DEFAULT],
        )
    except ImportError:
        log.warning("markdown or html2text not found, can't use Markdown syntax")
        log.info(
            "You can download/install them from https://pythonhosted.org/Markdown/ "
            "and https://github.com/Alir3z4/html2text/"
        )
    host.bridge.add_method(
        "syntax_convert",
        ".plugin",
        in_sign="sssbs",
        out_sign="s",
        async_=True,
        method=self.convert,
    )
    host.bridge.add_method(
        "syntax_get", ".plugin", in_sign="s", out_sign="s", method=self.get_syntax
    )
    # the cleaning method is only installed if nothing else has set one
    if xml_tools.clean_xhtml is None:
        log.debug("Installing cleaning method")
        xml_tools.clean_xhtml = self.clean_xhtml
268
def _update_param_options(self):
    """Regenerate the options of the syntax parameter and update the params

    Syntaxes flagged as hidden are skipped, the other ones are listed in
    case-insensitive alphabetical order, the default syntax being selected.
    """
    visible = sorted(
        (
            key
            for key, data in self.syntaxes.items()
            if TextSyntaxes.OPT_HIDDEN not in data["flags"]
        ),
        key=str.lower,
    )
    default_key = TextSyntaxes.default_syntax

    option_elts = []
    for key in visible:
        selected_attr = 'selected="true"' if key == default_key else ""
        option_elts.append('<option value="%s" %s/>' % (key, selected_attr))

    self.params_data["options"] = "\n".join(option_elts)
    self.host.memory.update_params(self.params % self.params_data)
288
def get_current_syntax(self, profile):
    """Get the syntax stored in the syntax parameter for a profile

    @param profile: profile key, given as "profile_key" to the params lookup
    @return: value of the syntax parameter for this profile
    """
    memory = self.host.memory
    return memory.param_get_a(NAME, CATEGORY, profile_key=profile)
296
def _log_error(self, failure, action="converting syntax"):
    """Log a failure then return it unchanged

    @param failure: failure to log
    @param action: description of what was being done when the failure happened
    @return: the failure received, so it can be propagated further
    """
    message = "Error while {action}: {failure}".format(
        action=action, failure=failure
    )
    log.error(message)
    return failure
302
def clean_style(self, styles_raw: str) -> str:
    """Clean unsafe CSS styles

    Remove styles not in the whitelist, or where the value doesn't match the regex.
    Declarations with "none" as value are removed too.
    @param styles_raw: CSS styles, as found in a "style" attribute
    @return: cleaned styles, as "key: value" declarations separated by "; "
    """
    cleaned_styles = []
    for style in styles_raw.split(";"):
        try:
            key, value = style.split(":")
        except ValueError:
            # there is no colon or more than one (e.g. in an URL): the
            # declaration is dropped
            continue
        key = key.lower().strip()
        if key not in STYLES_WHITELIST:
            continue
        value = value.lower().strip()
        if value == "none" or not STYLES_ACCEPTED_VALUE.match(value):
            continue
        cleaned_styles.append("%s: %s" % (key, value))
    return "; ".join(cleaned_styles)
329
def clean_classes(self, classes_raw: str) -> str:
    """Remove any non whitelisted class

    Remaining classes keep the order they had in the attribute and duplicates
    are removed, so the same input always gives the same output (joining a set
    would give an arbitrary order).
    @param classes_raw: classes set on an element
    @return: remaining classes (can be empty string)
    """
    # dict.fromkeys removes duplicates while keeping insertion order
    kept = dict.fromkeys(
        class_ for class_ in classes_raw.split() if class_ in SAFE_CLASSES
    )
    return " ".join(kept)
337
def clean_xhtml(self, xhtml):
    """Sanitise XHTML by removing potentially dangerous/malicious parts

    The content is cleaned with lxml's Cleaner, then "style" and "class"
    attributes are filtered with clean_style and clean_classes.
    @param xhtml(str, lxml.html.HtmlElement): raw HTML/XHTML to clean
    @return (str): cleaned XHTML, or an empty string if a blank string was given
    @raise etree.ParserError: a non blank string can't be parsed
    @raise exceptions.DataError: xhtml is neither a string nor an HtmlElement
    """
    if isinstance(xhtml, str):
        try:
            root = html.fromstring(xhtml)
        except etree.ParserError as e:
            if xhtml.strip():
                log.error("Can't clean XHTML: {xhtml}".format(xhtml=xhtml))
                raise e
            # nothing to clean in a blank string
            return ""
    elif isinstance(xhtml, html.HtmlElement):
        root = xhtml
    else:
        log.error("Only strings and HtmlElements can be cleaned")
        raise exceptions.DataError

    root = clean.Cleaner(
        style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
    ).clean_html(root)

    # "style" and "class" attributes get through the cleaner, they are
    # filtered with our own whitelists
    attr_cleaners = (
        ("//*[@style]", "style", self.clean_style),
        ("//*[@class]", "class", self.clean_classes),
    )
    for xpath, attr_name, attr_cleaner in attr_cleaners:
        for elt in root.xpath(xpath):
            elt.set(attr_name, attr_cleaner(elt.get(attr_name)))

    # we remove self-closing elements for non-void elements
    for element in root.iter(tag=etree.Element):
        if not element.text:
            element.text = None if element.tag in VOID_ELEMENTS else ''

    return html.tostring(root, encoding=str, method="xml")
374
def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True,
            profile=None):
    """Convert a text from one syntax to another, going through XHTML

    @param text: text to convert
    @param syntax_from: source syntax (e.g. "markdown")
    @param syntax_to: dest syntax (e.g.: "XHTML")
    @param safe: clean intermediate XHTML to avoid malicious code if True
    @param profile: needed only when syntax_from or syntax_to is set to
        _SYNTAX_CURRENT
    @return (Deferred): fires with the converted text, trailing whitespace
        removed
    @raise exceptions.NotFound: one of the syntaxes is not registered
    """
    # FIXME: convert should be abled to handle domish.Element directly
    #       when dealing with XHTML
    # TODO: a way for parser to return parsing errors/warnings

    def resolve(syntax):
        # _SYNTAX_CURRENT stands for the syntax selected by the profile
        if syntax == _SYNTAX_CURRENT:
            return self.get_current_syntax(profile)
        return syntax.lower().strip()

    syntax_from = resolve(syntax_from)
    syntax_to = resolve(syntax_to)

    registered = self.syntaxes
    for syntax in (syntax_from, syntax_to):
        if syntax not in registered:
            raise exceptions.NotFound(syntax)
    source = registered[syntax_from]
    dest = registered[syntax_to]

    # source syntax => XHTML
    to_xhtml = source["to"]
    if TextSyntaxes.OPT_NO_THREAD in source["flags"]:
        d = defer.maybeDeferred(to_xhtml, text)
    else:
        d = deferToThread(to_xhtml, text)

    # TODO: keep only body element and change it to a div here ?

    if safe:
        d.addCallback(self.clean_xhtml)

    # XHTML => destination syntax
    if TextSyntaxes.OPT_NO_THREAD in dest["flags"]:
        d.addCallback(dest["from"])
    else:
        d.addCallback(lambda xhtml: deferToThread(dest["from"], xhtml))

    # converters can add new lines that disturb the microblog change detection
    d.addCallback(lambda converted: converted.rstrip())
    return d
424
def add_syntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
    """Add a new syntax to the manager

    The syntax is stored under its lower-cased and stripped name, and the
    options of the syntax parameter are updated.
    @param name: unique name of the syntax
    @param to_xhtml_cb: callback to convert from syntax to XHTML
    @param from_xhtml_cb: callback to convert from XHTML to syntax
    @param flags: set of optional flags (a single flag may also be given as a
        string), can be:
        TextSyntaxes.OPT_DEFAULT: use as the default syntax (replace former one)
        TextSyntaxes.OPT_HIDDEN: do not show in parameters
        TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting (the callback may then return a deferred)
    @raise ValueError: OPT_HIDDEN and OPT_DEFAULT are both set
    @raise exceptions.ConflictError: a syntax with the same key already exists
    """
    if flags is None:
        flags = []
    elif isinstance(flags, str):
        # with a bare string, the "in" tests below (and in other methods) would
        # be substring matches instead of membership tests
        flags = [flags]
    if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
        raise ValueError(
            "{} and {} are mutually exclusive".format(
                TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT
            )
        )

    syntaxes = self.syntaxes
    key = name.lower().strip()
    if key in syntaxes:
        raise exceptions.ConflictError(
            "This syntax key already exists: {}".format(key)
        )
    syntaxes[key] = {
        "name": name,
        "to": to_xhtml_cb,
        "from": from_xhtml_cb,
        "flags": flags,
    }
    if TextSyntaxes.OPT_DEFAULT in flags:
        # the default is stored on the class, not on the instance
        TextSyntaxes.default_syntax = key

    self._update_param_options()
460
def get_syntax(self, name):
    """Get the key under which a syntax is registered

    @param name: name of the syntax, case and surrounding spaces are ignored
    @return: syntax key (lower-cased and stripped name)
    @raise exceptions.NotFound: syntax doesn't exist
    """
    key = name.lower().strip()
    if key not in self.syntaxes:
        raise exceptions.NotFound
    return key
470
471 def _remove_markups(self, xhtml):
472 """Remove XHTML markups from the given string.
473
474 @param xhtml: the XHTML string to be cleaned
475 @return: the cleaned string
476 """
477 cleaner = clean.Cleaner(kill_tags=["style"])
478 cleaned = cleaner.clean_html(html.fromstring(xhtml))
479 return html.tostring(cleaned, encoding=str, method="text")